summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks12
-rw-r--r--kernel/Kconfig.preempt2
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/audit.c20
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c265
-rw-r--r--kernel/bpf/bpf_struct_ops.c633
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h9
-rw-r--r--kernel/bpf/btf.c1239
-rw-r--r--kernel/bpf/cgroup.c145
-rw-r--r--kernel/bpf/core.c151
-rw-r--r--kernel/bpf/cpumap.c76
-rw-r--r--kernel/bpf/devmap.c540
-rw-r--r--kernel/bpf/dispatcher.c158
-rw-r--r--kernel/bpf/hashtab.c264
-rw-r--r--kernel/bpf/helpers.c14
-rw-r--r--kernel/bpf/inode.c140
-rw-r--r--kernel/bpf/local_storage.c30
-rw-r--r--kernel/bpf/map_in_map.c10
-rw-r--r--kernel/bpf/offload.c16
-rw-r--r--kernel/bpf/stackmap.c11
-rw-r--r--kernel/bpf/syscall.c1120
-rw-r--r--kernel/bpf/sysfs_btf.c46
-rw-r--r--kernel/bpf/tnum.c9
-rw-r--r--kernel/bpf/trampoline.c426
-rw-r--r--kernel/bpf/verifier.c1090
-rw-r--r--kernel/bpf/xskmap.c239
-rw-r--r--kernel/cgroup/cgroup-internal.h9
-rw-r--r--kernel/cgroup/cgroup-v1.c67
-rw-r--r--kernel/cgroup/cgroup.c376
-rw-r--r--kernel/cgroup/cpuset.c164
-rw-r--r--kernel/cgroup/freezer.c9
-rw-r--r--kernel/cgroup/pids.c11
-rw-r--r--kernel/cgroup/rstat.c48
-rw-r--r--kernel/compat.c24
-rw-r--r--kernel/configs.c9
-rw-r--r--kernel/context_tracking.c6
-rw-r--r--kernel/cpu.c231
-rw-r--r--kernel/cred.c10
-rw-r--r--kernel/debug/debug_core.c70
-rw-r--r--kernel/debug/debug_core.h3
-rw-r--r--kernel/debug/kdb/kdb_bp.c1
-rw-r--r--kernel/debug/kdb/kdb_bt.c120
-rw-r--r--kernel/debug/kdb/kdb_io.c240
-rw-r--r--kernel/debug/kdb/kdb_main.c5
-rw-r--r--kernel/debug/kdb/kdb_private.h3
-rw-r--r--kernel/dma/Kconfig24
-rw-r--r--kernel/dma/coherent.c29
-rw-r--r--kernel/dma/contiguous.c17
-rw-r--r--kernel/dma/debug.c41
-rw-r--r--kernel/dma/direct.c200
-rw-r--r--kernel/dma/mapping.c142
-rw-r--r--kernel/dma/remap.c104
-rw-r--r--kernel/dma/swiotlb.c39
-rw-r--r--kernel/elfcore.c1
-rw-r--r--kernel/events/core.c652
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/events/internal.h35
-rw-r--r--kernel/events/ring_buffer.c112
-rw-r--r--kernel/events/uprobes.c94
-rw-r--r--kernel/exit.c158
-rw-r--r--kernel/extable.c20
-rw-r--r--kernel/fork.c336
-rw-r--r--kernel/freezer.c6
-rw-r--r--kernel/futex.c339
-rwxr-xr-xkernel/gen_kheaders.sh72
-rw-r--r--kernel/irq/affinity.c231
-rw-r--r--kernel/irq/chip.c44
-rw-r--r--kernel/irq/cpuhotplug.c21
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/irqdesc.c18
-rw-r--r--kernel/irq/irqdomain.c30
-rw-r--r--kernel/irq/manage.c57
-rw-r--r--kernel/irq/msi.c5
-rw-r--r--kernel/irq/pm.c20
-rw-r--r--kernel/irq/proc.c56
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/irq_work.c34
-rw-r--r--kernel/jump_label.c4
-rw-r--r--kernel/kallsyms.c23
-rw-r--r--kernel/kcov.c547
-rw-r--r--kernel/kexec.c12
-rw-r--r--kernel/kexec_core.c10
-rw-r--r--kernel/kexec_elf.c430
-rw-r--r--kernel/kexec_file.c76
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kprobes.c140
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/livepatch/Makefile2
-rw-r--r--kernel/livepatch/core.c45
-rw-r--r--kernel/livepatch/core.h5
-rw-r--r--kernel/livepatch/patch.c3
-rw-r--r--kernel/livepatch/state.c119
-rw-r--r--kernel/livepatch/state.h9
-rw-r--r--kernel/livepatch/transition.c12
-rw-r--r--kernel/locking/lockdep.c169
-rw-r--r--kernel/locking/lockdep_internals.h9
-rw-r--r--kernel/locking/lockdep_proc.c23
-rw-r--r--kernel/locking/locktorture.c9
-rw-r--r--kernel/locking/mutex.c30
-rw-r--r--kernel/locking/osq_lock.c23
-rw-r--r--kernel/locking/qspinlock.c13
-rw-r--r--kernel/locking/qspinlock_paravirt.h2
-rw-r--r--kernel/locking/rtmutex.c12
-rw-r--r--kernel/locking/rwsem.c64
-rw-r--r--kernel/locking/spinlock_debug.c32
-rw-r--r--kernel/module.c197
-rw-r--r--kernel/module_signature.c46
-rw-r--r--kernel/module_signing.c56
-rw-r--r--kernel/notifier.c41
-rw-r--r--kernel/nsproxy.c41
-rw-r--r--kernel/padata.c655
-rw-r--r--kernel/panic.c54
-rw-r--r--kernel/params.c21
-rw-r--r--kernel/pid.c176
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/Kconfig5
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/hibernate.c26
-rw-r--r--kernel/power/main.c133
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/qos.c296
-rw-r--r--kernel/power/snapshot.c57
-rw-r--r--kernel/power/suspend.c74
-rw-r--r--kernel/power/suspend_test.c6
-rw-r--r--kernel/power/wakelock.c32
-rw-r--r--kernel/printk/braille.c15
-rw-r--r--kernel/printk/printk.c40
-rw-r--r--kernel/profile.c30
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/Kconfig23
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcu.h38
-rw-r--r--kernel/rcu/rcu_segcblist.c189
-rw-r--r--kernel/rcu/rcu_segcblist.h71
-rw-r--r--kernel/rcu/rcuperf.c199
-rw-r--r--kernel/rcu/rcutorture.c211
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c16
-rw-r--r--kernel/rcu/tiny.c28
-rw-r--r--kernel/rcu/tree.c592
-rw-r--r--kernel/rcu/tree.h100
-rw-r--r--kernel/rcu/tree_exp.h154
-rw-r--r--kernel/rcu/tree_plugin.h1353
-rw-r--r--kernel/rcu/tree_stall.h43
-rw-r--r--kernel/rcu/update.c119
-rw-r--r--kernel/resource.c49
-rw-r--r--kernel/rseq.c2
-rw-r--r--kernel/sched/clock.c6
-rw-r--r--kernel/sched/core.c920
-rw-r--r--kernel/sched/cpufreq.c18
-rw-r--r--kernel/sched/cpufreq_schedutil.c25
-rw-r--r--kernel/sched/cpupri.c25
-rw-r--r--kernel/sched/cpupri.h4
-rw-r--r--kernel/sched/cputime.c307
-rw-r--r--kernel/sched/deadline.c158
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c2042
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c67
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/loadavg.c33
-rw-r--r--kernel/sched/membarrier.c240
-rw-r--r--kernel/sched/pelt.c20
-rw-r--r--kernel/sched/psi.c70
-rw-r--r--kernel/sched/rt.c185
-rw-r--r--kernel/sched/sched.h159
-rw-r--r--kernel/sched/stats.h7
-rw-r--r--kernel/sched/stop_task.c33
-rw-r--r--kernel/sched/topology.c112
-rw-r--r--kernel/sched/wait.c37
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/seccomp.c35
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/smp.c99
-rw-r--r--kernel/stacktrace.c10
-rw-r--r--kernel/stop_machine.c45
-rw-r--r--kernel/sys.c81
-rw-r--r--kernel/sys_ni.c23
-rw-r--r--kernel/sysctl-test.c394
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/sysctl_binary.c1305
-rw-r--r--kernel/taskstats.c30
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c141
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/hrtimer.c270
-rw-r--r--kernel/time/itimer.c207
-rw-r--r--kernel/time/namespace.c468
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-clock.c39
-rw-r--r--kernel/time/posix-cpu-timers.c1042
-rw-r--r--kernel/time/posix-stubs.c18
-rw-r--r--kernel/time/posix-timers.c149
-rw-r--r--kernel/time/posix-timers.h8
-rw-r--r--kernel/time/sched_clock.c9
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c59
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/time/tick-sched.c44
-rw-r--r--kernel/time/time.c116
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timer.c113
-rw-r--r--kernel/time/vsyscall.c64
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig385
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c92
-rw-r--r--kernel/trace/bpf_trace.c276
-rw-r--r--kernel/trace/fgraph.c38
-rw-r--r--kernel/trace/ftrace.c737
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/kprobe_event_gen_test.c225
-rw-r--r--kernel/trace/preemptirq_delay_test.c144
-rw-r--r--kernel/trace/ring_buffer.c141
-rw-r--r--kernel/trace/ring_buffer_benchmark.c8
-rw-r--r--kernel/trace/synth_event_gen_test.c523
-rw-r--r--kernel/trace/trace.c870
-rw-r--r--kernel/trace/trace.h168
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--kernel/trace/trace_boot.c334
-rw-r--r--kernel/trace/trace_branch.c14
-rw-r--r--kernel/trace/trace_dynevent.c226
-rw-r--r--kernel/trace/trace_dynevent.h39
-rw-r--r--kernel/trace/trace_entries.h68
-rw-r--r--kernel/trace/trace_event_perf.c19
-rw-r--r--kernel/trace/trace_events.c216
-rw-r--r--kernel/trace/trace_events_filter.c8
-rw-r--r--kernel/trace/trace_events_hist.c1107
-rw-r--r--kernel/trace/trace_events_inject.c329
-rw-r--r--kernel/trace/trace_events_trigger.c35
-rw-r--r--kernel/trace/trace_export.c110
-rw-r--r--kernel/trace/trace_functions.c8
-rw-r--r--kernel/trace/trace_functions_graph.c14
-rw-r--r--kernel/trace/trace_hwlat.c19
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/trace/trace_kdb.c8
-rw-r--r--kernel/trace/trace_kprobe.c567
-rw-r--r--kernel/trace/trace_mmiotrace.c12
-rw-r--r--kernel/trace/trace_output.c23
-rw-r--r--kernel/trace/trace_printk.c7
-rw-r--r--kernel/trace/trace_probe.c205
-rw-r--r--kernel/trace/trace_probe.h77
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c27
-rw-r--r--kernel/trace/trace_selftest.c26
-rw-r--r--kernel/trace/trace_seq.c33
-rw-r--r--kernel/trace/trace_stack.c125
-rw-r--r--kernel/trace/trace_stat.c43
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c91
-rw-r--r--kernel/trace/trace_uprobe.c421
-rw-r--r--kernel/trace/tracing_map.c4
-rw-r--r--kernel/tsacct.c9
-rw-r--r--kernel/up.c15
-rw-r--r--kernel/watchdog.c35
-rw-r--r--kernel/workqueue.c126
261 files changed, 25313 insertions, 10145 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index e0852dc333ac..3de8fd11873b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK
# unlock and unlock_irq functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
# or
-# - DEBUG_SPINLOCK=n and PREEMPT=n
+# - DEBUG_SPINLOCK=n and PREEMPTION=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
@@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
def_bool y
@@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
def_bool y
@@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
def_bool y
@@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
def_bool y
@@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool y
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index deff97217496..bf82259cff96 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -65,7 +65,7 @@ config PREEMPT_RT
preemptible priority-inheritance aware variants, enforcing
interrupt threading and introducing mechanisms to break up long
non-preemptible sections. This makes the kernel, except for very
- low level and critical code pathes (entry code, scheduler, low
+ low level and critical code paths (entry code, scheduler, low
level interrupt handling) fully preemptible and brings most
execution contexts under scheduler control.
diff --git a/kernel/Makefile b/kernel/Makefile
index ef0d95a190b4..4cb4130ced32 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ KCOV_INSTRUMENT_softirq.o := n
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
+KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
@@ -58,12 +59,14 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
+obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
@@ -113,6 +116,8 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
+
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
@@ -126,7 +131,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
+ cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
$(obj)/kheaders_data.tar.xz: FORCE
$(call cmd,genikh)
diff --git a/kernel/acct.c b/kernel/acct.c
index 81f9831a7859..11ff4a596d6b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -416,6 +416,7 @@ static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
u64 elapsed, run_time;
+ time64_t btime;
struct tty_struct *tty;
/*
@@ -448,7 +449,8 @@ static void fill_ac(acct_t *ac)
}
#endif
do_div(elapsed, AHZ);
- ac->ac_btime = get_seconds() - elapsed;
+ btime = ktime_get_real_seconds() - elapsed;
+ ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
#if ACCT_VERSION==2
ac->ac_ahz = AHZ;
#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index da8dc0db5bd3..17b0d523afb3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -102,12 +102,13 @@ struct audit_net {
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the associated spinlock for writing.
*/
-static struct auditd_connection {
+struct auditd_connection {
struct pid *pid;
u32 portid;
struct net *net;
struct rcu_head rcu;
-} *auditd_conn = NULL;
+};
+static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
@@ -830,7 +831,7 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
- if (ac && rc < 0) {
+ if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
@@ -840,7 +841,7 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
- if (ac && rc < 0) {
+ if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
@@ -2155,18 +2156,19 @@ void audit_log_task_info(struct audit_buffer *ab)
EXPORT_SYMBOL(audit_log_task_info);
/**
- * audit_log_link_denied - report a link restriction denial
- * @operation: specific link operation
+ * audit_log_path_denied - report a path restriction denial
+ * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
+ * @operation: specific operation name
*/
-void audit_log_link_denied(const char *operation)
+void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
if (!audit_enabled || audit_dummy_context())
return;
- /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
- ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK);
+ /* Generate log with subject, operation, outcome. */
+ ab = audit_log_start(audit_context(), GFP_KERNEL, type);
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 1f31c2f1e6fc..4508d5e0cf69 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
+ inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 29d781061cd5..046ce5d98033 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
@@ -22,3 +24,9 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
endif
+ifeq ($(CONFIG_SYSFS),y)
+obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
+endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..95d77770353c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,7 +14,7 @@
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr)
(percpu && numa_node != NUMA_NO_NODE))
return -EINVAL;
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_flags & BPF_F_MMAPABLE)
+ return -EINVAL;
+
if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
@@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
array_size = sizeof(*array);
- if (percpu)
+ if (percpu) {
array_size += (u64) max_entries * sizeof(void *);
- else
- array_size += (u64) max_entries * elem_size;
+ } else {
+ /* rely on vmalloc() to return page-aligned memory and
+ * ensure array->value is exactly page-aligned
+ */
+ if (attr->map_flags & BPF_F_MMAPABLE) {
+ array_size = PAGE_ALIGN(array_size);
+ array_size += PAGE_ALIGN((u64) max_entries * elem_size);
+ } else {
+ array_size += (u64) max_entries * elem_size;
+ }
+ }
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
@@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
+ if (attr->map_flags & BPF_F_MMAPABLE) {
+ void *data;
+
+ /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
+ data = bpf_map_area_mmapable_alloc(array_size, numa_node);
+ if (!data) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+ array = data + PAGE_ALIGN(sizeof(struct bpf_array))
+ - offsetof(struct bpf_array, value);
+ } else {
+ array = bpf_map_area_alloc(array_size, numa_node);
+ }
if (!array) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
@@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
+static void *array_map_vmalloc_addr(struct bpf_array *array)
+{
+ return (void *)round_down((unsigned long)array, PAGE_SIZE);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
- bpf_map_area_free(array);
+ if (array->map.map_flags & BPF_F_MMAPABLE)
+ bpf_map_area_free(array_map_vmalloc_addr(array));
+ else
+ bpf_map_area_free(array);
}
static void array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map,
return 0;
}
+static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
+
+ if (!(map->map_flags & BPF_F_MMAPABLE))
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
+}
+
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -455,8 +500,11 @@ const struct bpf_map_ops array_map_ops = {
.map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta,
+ .map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
};
const struct bpf_map_ops percpu_array_map_ops = {
@@ -540,10 +588,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
- old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ }
+
if (old_ptr)
map->ops->map_fd_put_ptr(old_ptr);
-
return 0;
}
@@ -556,7 +611,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return -E2BIG;
- old_ptr = xchg(array->ptrs + index, NULL);
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ map->ops->map_poke_run(map, index, old_ptr, NULL);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, NULL);
+ }
+
if (old_ptr) {
map->ops->map_fd_put_ptr(old_ptr);
return 0;
@@ -625,17 +688,195 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+struct prog_poke_elem {
+ struct list_head list;
+ struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+ int ret = 0;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux)
+ goto out;
+ }
+
+ elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+ if (!elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&elem->list);
+ /* We must track the program's aux info at this point in time
+ * since the program pointer itself may not be stable yet, see
+ * also comment in prog_array_map_poke_run().
+ */
+ elem->aux = prog_aux;
+
+ list_add_tail(&elem->list, &aux->poke_progs);
+out:
+ mutex_unlock(&aux->poke_mutex);
+ return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ break;
+ }
+ }
+ mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+ struct bpf_prog *old,
+ struct bpf_prog *new)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ struct bpf_jit_poke_descriptor *poke;
+ int i, ret;
+
+ for (i = 0; i < elem->aux->size_poke_tab; i++) {
+ poke = &elem->aux->poke_tab[i];
+
+ /* Few things to be aware of:
+ *
+ * 1) We can only ever access aux in this context, but
+ * not aux->prog since it might not be stable yet and
+ * there could be danger of use after free otherwise.
+ * 2) Initially when we start tracking aux, the program
+ * is not JITed yet and also does not have a kallsyms
+ * entry. We skip these as poke->ip_stable is not
+ * active yet. The JIT will do the final fixup before
+ * setting it stable. The various poke->ip_stable are
+ * successively activated, so tail call updates can
+ * arrive from here while JIT is still finishing its
+ * final fixup for non-activated poke entries.
+ * 3) On program teardown, the program's kallsym entry gets
+ * removed out of RCU callback, but we can only untrack
+ * from sleepable context, therefore bpf_arch_text_poke()
+ * might not see that this is in BPF text section and
+ * bails out with -EINVAL. As these are unreachable since
+ * RCU grace period already passed, we simply skip them.
+ * 4) Also programs reaching refcount of zero while patching
+ * is in progress is okay since we're protected under
+ * poke_mutex and untrack the programs before the JIT
+ * buffer is freed. When we're still in the middle of
+ * patching and suddenly kallsyms entry of the program
+ * gets evicted, we just skip the rest which is fine due
+ * to point 3).
+ * 5) Any other error happening below from bpf_arch_text_poke()
+ * is a unexpected bug.
+ */
+ if (!READ_ONCE(poke->ip_stable))
+ continue;
+ if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+ continue;
+ if (poke->tail_call.map != map ||
+ poke->tail_call.key != key)
+ continue;
+
+ ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
+ old ? (u8 *)old->bpf_func +
+ poke->adj_off : NULL,
+ new ? (u8 *)new->bpf_func +
+ poke->adj_off : NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
+ }
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_array_aux,
+ work)->map;
+ bpf_fd_array_map_clear(map);
+ bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+ map)->aux;
+ bpf_map_inc(map);
+ schedule_work(&aux->work);
+}
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array_aux *aux;
+ struct bpf_map *map;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+ if (!aux)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+ INIT_LIST_HEAD(&aux->poke_progs);
+ mutex_init(&aux->poke_mutex);
+
+ map = array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ kfree(aux);
+ return map;
+ }
+
+ container_of(map, struct bpf_array, map)->aux = aux;
+ aux->map = map;
+
+ return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ }
+ kfree(aux);
+ fd_array_map_free(map);
+}
+
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
- .map_alloc = array_map_alloc,
- .map_free = fd_array_map_free,
+ .map_alloc = prog_array_map_alloc,
+ .map_free = prog_array_map_free,
+ .map_poke_track = prog_array_map_poke_track,
+ .map_poke_untrack = prog_array_map_poke_untrack,
+ .map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
- .map_release_uref = bpf_fd_array_map_clear,
+ .map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
new file mode 100644
index 000000000000..042f95534f86
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <linux/seq_file.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
+
+enum bpf_struct_ops_state {
+ BPF_STRUCT_OPS_STATE_INIT,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE,
+};
+
+#define BPF_STRUCT_OPS_COMMON_VALUE \
+ refcount_t refcnt; \
+ enum bpf_struct_ops_state state
+
+struct bpf_struct_ops_value {
+ BPF_STRUCT_OPS_COMMON_VALUE;
+ char data[0] ____cacheline_aligned_in_smp;
+};
+
+struct bpf_struct_ops_map {
+ struct bpf_map map;
+ const struct bpf_struct_ops *st_ops;
+ /* protect map_update */
+ struct mutex lock;
+ /* progs has all the bpf_prog that is populated
+ * to the func ptr of the kernel's struct
+ * (in kvalue.data).
+ */
+ struct bpf_prog **progs;
+ /* image is a page that has all the trampolines
+ * that stores the func args before calling the bpf_prog.
+ * A PAGE_SIZE "image" is enough to store all trampoline for
+ * "progs[]".
+ */
+ void *image;
+ /* uvalue->data stores the kernel struct
+ * (e.g. tcp_congestion_ops) that is more useful
+ * to userspace than the kvalue. For example,
+ * the bpf_prog's id is stored instead of the kernel
+ * address of a func ptr.
+ */
+ struct bpf_struct_ops_value *uvalue;
+ /* kvalue.data stores the actual kernel's struct
+ * (e.g. tcp_congestion_ops) that will be
+ * registered to the kernel subsystem.
+ */
+ struct bpf_struct_ops_value kvalue;
+};
+
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
+
+/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
+ * the map's value exposed to the userspace and its btf-type-id is
+ * stored at the map->btf_vmlinux_value_type_id.
+ *
+ */
+#define BPF_STRUCT_OPS_TYPE(_name) \
+extern struct bpf_struct_ops bpf_##_name; \
+ \
+struct bpf_struct_ops_##_name { \
+ BPF_STRUCT_OPS_COMMON_VALUE; \
+ struct _name data ____cacheline_aligned_in_smp; \
+};
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+enum {
+#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+ __NR_BPF_STRUCT_OPS_TYPE,
+};
+
+static struct bpf_struct_ops * const bpf_struct_ops[] = {
+#define BPF_STRUCT_OPS_TYPE(_name) \
+ [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+};
+
+const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
+};
+
+const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+};
+
+static const struct btf_type *module_type;
+
+void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+{
+ s32 type_id, value_id, module_id;
+ const struct btf_member *member;
+ struct bpf_struct_ops *st_ops;
+ const struct btf_type *t;
+ char value_name[128];
+ const char *mname;
+ u32 i, j;
+
+ /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
+#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+ module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
+ if (module_id < 0) {
+ pr_warn("Cannot find struct module in btf_vmlinux\n");
+ return;
+ }
+ module_type = btf_type_by_id(btf, module_id);
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ st_ops = bpf_struct_ops[i];
+
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ continue;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ value_name);
+ continue;
+ }
+
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ st_ops->name);
+ continue;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ continue;
+ }
+
+ for_each_member(j, t, member) {
+ const struct btf_type *func_proto;
+
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ break;
+ }
+
+ if (btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ break;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (func_proto &&
+ btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[j])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ break;
+ }
+ }
+
+ if (j == btf_type_vlen(t)) {
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ } else {
+ st_ops->type_id = type_id;
+ st_ops->type = t;
+ st_ops->value_id = value_id;
+ st_ops->value_type = btf_type_by_id(btf,
+ value_id);
+ }
+ }
+ }
+}
+
+extern struct btf *btf_vmlinux;
+
+static const struct bpf_struct_ops *
+bpf_struct_ops_find_value(u32 value_id)
+{
+ unsigned int i;
+
+ if (!value_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->value_id == value_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+{
+ unsigned int i;
+
+ if (!type_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->type_id == type_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ if (key && *(u32 *)key == 0)
+ return -ENOENT;
+
+ *(u32 *)next_key = 0;
+ return 0;
+}
+
+int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ enum bpf_struct_ops_state state;
+
+ if (unlikely(*(u32 *)key != 0))
+ return -ENOENT;
+
+ kvalue = &st_map->kvalue;
+ /* Pair with smp_store_release() during map_update */
+ state = smp_load_acquire(&kvalue->state);
+ if (state == BPF_STRUCT_OPS_STATE_INIT) {
+ memset(value, 0, map->value_size);
+ return 0;
+ }
+
+ /* No lock is needed. state and refcnt do not need
+ * to be updated together under atomic context.
+ */
+ uvalue = (struct bpf_struct_ops_value *)value;
+ memcpy(uvalue, st_map->uvalue, map->value_size);
+ uvalue->state = state;
+ refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ return 0;
+}
+
+static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
+{
+ const struct btf_type *t = st_map->st_ops->type;
+ u32 i;
+
+ for (i = 0; i < btf_type_vlen(t); i++) {
+ if (st_map->progs[i]) {
+ bpf_prog_put(st_map->progs[i]);
+ st_map->progs[i] = NULL;
+ }
+ }
+}
+
+static int check_zero_holes(const struct btf_type *t, void *data)
+{
+ const struct btf_member *member;
+ u32 i, moff, msize, prev_mend = 0;
+ const struct btf_type *mtype;
+
+ for_each_member(i, t, member) {
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (moff > prev_mend &&
+ memchr_inv(data + prev_mend, 0, moff - prev_mend))
+ return -EINVAL;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype))
+ return PTR_ERR(mtype);
+ prev_mend = moff + msize;
+ }
+
+ if (t->size > prev_mend &&
+ memchr_inv(data + prev_mend, 0, t->size - prev_mend))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_member *member;
+ const struct btf_type *t = st_ops->type;
+ void *udata, *kdata;
+ int prog_fd, err = 0;
+ void *image;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+
+ if (*(u32 *)key != 0)
+ return -E2BIG;
+
+ err = check_zero_holes(st_ops->value_type, value);
+ if (err)
+ return err;
+
+ uvalue = (struct bpf_struct_ops_value *)value;
+ err = check_zero_holes(t, uvalue->data);
+ if (err)
+ return err;
+
+ if (uvalue->state || refcount_read(&uvalue->refcnt))
+ return -EINVAL;
+
+ uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
+ kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+ mutex_lock(&st_map->lock);
+
+ if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ err = -EBUSY;
+ goto unlock;
+ }
+
+ memcpy(uvalue, value, map->value_size);
+
+ udata = &uvalue->data;
+ kdata = &kvalue->data;
+ image = st_map->image;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *mtype, *ptype;
+ struct bpf_prog *prog;
+ u32 moff;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ if (ptype == module_type) {
+ if (*(void **)(udata + moff))
+ goto reset_unlock;
+ *(void **)(kdata + moff) = BPF_MODULE_OWNER;
+ continue;
+ }
+
+ err = st_ops->init_member(t, member, kdata, udata);
+ if (err < 0)
+ goto reset_unlock;
+
+ /* The ->init_member() has handled this member */
+ if (err > 0)
+ continue;
+
+ /* If st_ops->init_member does not handle it,
+ * we will only handle func ptrs and zero-ed members
+ * here. Reject everything else.
+ */
+
+ /* All non func ptr member must be 0 */
+ if (!ptype || !btf_type_is_func_proto(ptype)) {
+ u32 msize;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype)) {
+ err = PTR_ERR(mtype);
+ goto reset_unlock;
+ }
+
+ if (memchr_inv(udata + moff, 0, msize)) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ continue;
+ }
+
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ /* Similar check as the attr->attach_prog_fd */
+ if (!prog_fd)
+ continue;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto reset_unlock;
+ }
+ st_map->progs[i] = prog;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->expected_attach_type != i) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ err = arch_prepare_bpf_trampoline(image,
+ st_map->image + PAGE_SIZE,
+ &st_ops->func_models[i], 0,
+ &prog, 1, NULL, 0, NULL);
+ if (err < 0)
+ goto reset_unlock;
+
+ *(void **)(kdata + moff) = image;
+ image += err;
+
+ /* put prog_id to udata */
+ *(unsigned long *)(udata + moff) = prog->aux->id;
+ }
+
+ refcount_set(&kvalue->refcnt, 1);
+ bpf_map_inc(map);
+
+ set_memory_ro((long)st_map->image, 1);
+ set_memory_x((long)st_map->image, 1);
+ err = st_ops->reg(kdata);
+ if (likely(!err)) {
+ /* Pair with smp_load_acquire() during lookup_elem().
+ * It ensures the above udata updates (e.g. prog->aux->id)
+ * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ goto unlock;
+ }
+
+ /* Error during st_ops->reg(). It is very unlikely since
+ * the above init_member() should have caught it earlier
+ * before reg(). The only possibility is if there was a race
+ * in registering the struct_ops (under the same name) to
+ * a sub-system through different struct_ops's maps.
+ */
+ set_memory_nx((long)st_map->image, 1);
+ set_memory_rw((long)st_map->image, 1);
+ bpf_map_put(map);
+
+reset_unlock:
+ bpf_struct_ops_map_put_progs(st_map);
+ memset(uvalue, 0, map->value_size);
+ memset(kvalue, 0, map->value_size);
+unlock:
+ mutex_unlock(&st_map->lock);
+ return err;
+}
+
+static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+{
+ enum bpf_struct_ops_state prev_state;
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+ prev_state = cmpxchg(&st_map->kvalue.state,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE);
+ if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ if (refcount_dec_and_test(&st_map->kvalue.refcnt))
+ bpf_map_put(map);
+ }
+
+ return 0;
+}
+
+static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+ int err;
+
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ return;
+
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ if (!err) {
+ btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ value, m);
+ seq_puts(m, "\n");
+ }
+
+ kfree(value);
+}
+
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ if (st_map->progs)
+ bpf_struct_ops_map_put_progs(st_map);
+ bpf_map_area_free(st_map->progs);
+ bpf_jit_free_exec(st_map->image);
+ bpf_map_area_free(st_map->uvalue);
+ bpf_map_area_free(st_map);
+}
+
+static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
+ attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ return -EINVAL;
+ return 0;
+}
+
+static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
+{
+ const struct bpf_struct_ops *st_ops;
+ size_t map_total_size, st_map_size;
+ struct bpf_struct_ops_map *st_map;
+ const struct btf_type *t, *vt;
+ struct bpf_map_memory mem;
+ struct bpf_map *map;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+ if (!st_ops)
+ return ERR_PTR(-ENOTSUPP);
+
+ vt = st_ops->value_type;
+ if (attr->value_size != vt->size)
+ return ERR_PTR(-EINVAL);
+
+ t = st_ops->type;
+
+ st_map_size = sizeof(*st_map) +
+ /* kvalue stores the
+ * struct bpf_struct_ops_tcp_congestions_ops
+ */
+ (vt->size - sizeof(struct bpf_struct_ops_value));
+ map_total_size = st_map_size +
+ /* uvalue */
+ sizeof(vt->size) +
+ /* struct bpf_progs **progs */
+ btf_type_vlen(t) * sizeof(struct bpf_prog *);
+ err = bpf_map_charge_init(&mem, map_total_size);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
+ if (!st_map) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+ st_map->st_ops = st_ops;
+ map = &st_map->map;
+
+ st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->progs =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ NUMA_NO_NODE);
+ st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+ bpf_struct_ops_map_free(map);
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mutex_init(&st_map->lock);
+ set_vm_flush_reset_perms(st_map->image);
+ bpf_map_init_from_attr(map, attr);
+ bpf_map_charge_move(&map->memory, &mem);
+
+ return map;
+}
+
+const struct bpf_map_ops bpf_struct_ops_map_ops = {
+ .map_alloc_check = bpf_struct_ops_map_alloc_check,
+ .map_alloc = bpf_struct_ops_map_alloc,
+ .map_free = bpf_struct_ops_map_free,
+ .map_get_next_key = bpf_struct_ops_map_get_next_key,
+ .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
+ .map_delete_elem = bpf_struct_ops_map_delete_elem,
+ .map_update_elem = bpf_struct_ops_map_update_elem,
+ .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+};
+
+/* "const void *" because some subsystem is
+ * passing a const (e.g. const struct tcp_congestion_ops *)
+ */
+bool bpf_struct_ops_get(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+
+ return refcount_inc_not_zero(&kvalue->refcnt);
+}
+
+void bpf_struct_ops_put(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ if (refcount_dec_and_test(&kvalue->refcnt)) {
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(kvalue, struct bpf_struct_ops_map,
+ kvalue);
+ bpf_map_put(&st_map->map);
+ }
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
new file mode 100644
index 000000000000..066d83ea1c99
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* internal file - do not include directly */
+
+#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
+#endif
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5fcc7a17eb5a..805c43b083e9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2,6 +2,8 @@
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/types.h>
#include <linux/seq_file.h>
#include <linux/compiler.h>
@@ -16,6 +18,9 @@
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
+#include <linux/skmsg.h>
+#include <linux/perf_event.h>
+#include <net/sock.h>
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -175,11 +180,6 @@
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
-#define for_each_member(i, struct_type, member) \
- for (i = 0, member = btf_type_member(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -195,8 +195,8 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-static DEFINE_IDR(btf_idr);
-static DEFINE_SPINLOCK(btf_idr_lock);
+DEFINE_IDR(btf_idr);
+DEFINE_SPINLOCK(btf_idr_lock);
struct btf {
void *data;
@@ -276,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
+static const char *btf_type_str(const struct btf_type *t)
+{
+ return btf_kind_str[BTF_INFO_KIND(t->info)];
+}
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -336,16 +341,6 @@ static bool btf_type_is_fwd(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
}
-static bool btf_type_is_func(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
-}
-
-static bool btf_type_is_func_proto(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
-}
-
static bool btf_type_nosize(const struct btf_type *t)
{
return btf_type_is_void(t) || btf_type_is_fwd(t) ||
@@ -377,24 +372,73 @@ static bool btf_type_is_array(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
}
-static bool btf_type_is_ptr(const struct btf_type *t)
+static bool btf_type_is_var(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
}
-static bool btf_type_is_int(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_is_var(const struct btf_type *t)
+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
+ const struct btf_type *t;
+ const char *tname;
+ u32 i;
+
+ for (i = 1; i <= btf->nr_types; i++) {
+ t = btf->types[i];
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, name))
+ return i;
+ }
+
+ return -ENOENT;
}
-static bool btf_type_is_datasec(const struct btf_type *t)
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+ u32 id, u32 *res_id)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t)) {
+ id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ if (res_id)
+ *res_id = id;
+
+ return t;
+}
+
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, id, NULL);
+ if (!btf_type_is_ptr(t))
+ return NULL;
+
+ return btf_type_skip_modifiers(btf, t->type, res_id);
+}
+
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *ptype;
+
+ ptype = btf_type_resolve_ptr(btf, id, res_id);
+ if (ptype && btf_type_is_func_proto(ptype))
+ return ptype;
+
+ return NULL;
}
/* Types that act only as a source, not sink or intermediate
@@ -461,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding)
return "UNKN";
}
-static u16 btf_type_vlen(const struct btf_type *t)
-{
- return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
- return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
- : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
- : 0;
-}
-
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -495,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
return (const struct btf_array *)(t + 1);
}
-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
- return (const struct btf_member *)(t + 1);
-}
-
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
@@ -698,6 +713,13 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (log->level == BPF_LOG_KERNEL && !fmt)
+ return;
+
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
btf_kind_str[kind],
@@ -735,6 +757,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
+ if (log->level == BPF_LOG_KERNEL && !fmt)
+ return;
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -777,6 +801,8 @@ static void btf_verifier_log_vsi(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
+ if (log->level == BPF_LOG_KERNEL && !fmt)
+ return;
if (env->phase != CHECK_META)
btf_verifier_log_type(env, datasec_type, NULL);
@@ -802,6 +828,8 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
+ if (log->level == BPF_LOG_KERNEL)
+ return;
hdr = &btf->hdr;
__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
__btf_verifier_log(log, "version: %u\n", hdr->version);
@@ -1043,6 +1071,84 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
}
+/* Resolve the size of a passed-in "type"
+ *
+ * type: is an array (e.g. u32 array[x][y])
+ * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY,
+ * *type_size: (x * y * sizeof(u32)). Hence, *type_size always
+ * corresponds to the return type.
+ * *elem_type: u32
+ * *total_nelems: (x * y). Hence, individual elem size is
+ * (*type_size / *total_nelems)
+ *
+ * type: is not an array (e.g. const struct X)
+ * return type: type "struct X"
+ * *type_size: sizeof(struct X)
+ * *elem_type: same as return type ("struct X")
+ * *total_nelems: 1
+ */
+const struct btf_type *
+btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size, const struct btf_type **elem_type,
+ u32 *total_nelems)
+{
+ const struct btf_type *array_type = NULL;
+ const struct btf_array *array;
+ u32 i, size, nelems = 1;
+
+ for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
+ switch (BTF_INFO_KIND(type->info)) {
+ /* type->size can be used */
+ case BTF_KIND_INT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ size = type->size;
+ goto resolved;
+
+ case BTF_KIND_PTR:
+ size = sizeof(void *);
+ goto resolved;
+
+ /* Modifiers */
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ type = btf_type_by_id(btf, type->type);
+ break;
+
+ case BTF_KIND_ARRAY:
+ if (!array_type)
+ array_type = type;
+ array = btf_type_array(type);
+ if (nelems && array->nelems > U32_MAX / nelems)
+ return ERR_PTR(-EINVAL);
+ nelems *= array->nelems;
+ type = btf_type_by_id(btf, array->type);
+ break;
+
+ /* type without size */
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+
+resolved:
+ if (nelems && size > U32_MAX / nelems)
+ return ERR_PTR(-EINVAL);
+
+ *type_size = nelems * size;
+ if (total_nelems)
+ *total_nelems = nelems;
+ if (elem_type)
+ *elem_type = type;
+
+ return array_type ? : type;
+}
+
/* The input param "type_id" must point to a needs_resolve type */
static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
u32 *type_id)
@@ -1752,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
u32 type_id, void *data,
u8 bits_offset, struct seq_file *m)
{
- t = btf_type_id_resolve(btf, &type_id);
+ if (btf->resolved_ids)
+ t = btf_type_id_resolve(btf, &type_id);
+ else
+ t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
@@ -2332,7 +2441,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
- return -EINVAL;
+ return -EINVAL;
}
nr_bits = int_bitsize;
@@ -2377,9 +2486,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (t->size != sizeof(int)) {
- btf_verifier_log_type(env, t, "Expected size:%zu",
- sizeof(int));
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
}
@@ -2406,7 +2514,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
-
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
btf_verifier_log(env, "\t%s val=%d\n",
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
@@ -2547,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen != 0");
+ if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+ btf_verifier_log_type(env, t, "Invalid func linkage");
return -EINVAL;
}
@@ -3368,6 +3477,1000 @@ errout:
return ERR_PTR(err);
}
+extern char __weak _binary__btf_vmlinux_bin_start[];
+extern char __weak _binary__btf_vmlinux_bin_end[];
+extern struct btf *btf_vmlinux;
+
+#define BPF_MAP_TYPE(_id, _ops)
+static union {
+ struct bpf_ctx_convert {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ prog_ctx_type _id##_prog; \
+ kern_ctx_type _id##_kern;
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ } *__t;
+ /* 't' is written once under lock. Read many times. */
+ const struct btf_type *t;
+} bpf_ctx_convert;
+enum {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ __ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ __ctx_convert_unused, /* to avoid empty enum in extreme .config */
+};
+static u8 bpf_ctx_convert_map[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ [_id] = __ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ 0, /* avoid empty array */
+};
+#undef BPF_MAP_TYPE
+
+static const struct btf_member *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_type *ctx_struct;
+ const struct btf_member *ctx_type;
+ const char *tname, *ctx_tname;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return NULL;
+ }
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_struct(t)) {
+ /* Only pointer to struct is supported for now.
+ * That means that BPF_PROG_TYPE_TRACEPOINT with BTF
+ * is not supported yet.
+ * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
+ */
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "arg#%d type is not a struct\n", arg);
+ return NULL;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname) {
+ bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
+ return NULL;
+ }
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_struct is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
+ if (!ctx_struct)
+ /* should not happen */
+ return NULL;
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+ if (!ctx_tname) {
+ /* should not happen */
+ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
+ return NULL;
+ }
+ /* only compare that prog's ctx type name is the same as
+ * kernel expects. No need to compare field by field.
+ * It's ok for bpf prog to do:
+ * struct __sk_buff {};
+ * int socket_filter_bpf_prog(struct __sk_buff *skb)
+ * { // no fields of skb are ever used }
+ */
+ if (strcmp(ctx_tname, tname))
+ return NULL;
+ return ctx_type;
+}
+
+static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
+ struct btf *btf,
+ const struct btf_type *t,
+ enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_member *prog_ctx_type, *kern_ctx_type;
+
+ prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
+ if (!prog_ctx_type)
+ return -ENOENT;
+ kern_ctx_type = prog_ctx_type + 1;
+ return kern_ctx_type->type;
+}
+
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL;
+ int err, i;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ btf->data = _binary__btf_vmlinux_bin_start;
+ btf->data_size = _binary__btf_vmlinux_bin_end -
+ _binary__btf_vmlinux_bin_start;
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ goto errout;
+
+ /* find struct bpf_ctx_convert for type checking later */
+ for (i = 1; i <= btf->nr_types; i++) {
+ const struct btf_type *t;
+ const char *tname;
+
+ t = btf_type_by_id(btf, i);
+ if (!__btf_type_is_struct(t))
+ continue;
+ tname = __btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, "bpf_ctx_convert")) {
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = t;
+ break;
+ }
+ }
+ if (i > btf->nr_types) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ bpf_struct_ops_init(btf, log);
+
+ btf_verifier_env_free(env);
+ refcount_set(&btf->refcnt, 1);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf) {
+ kvfree(btf->types);
+ kfree(btf);
+ }
+ return ERR_PTR(err);
+}
+
+struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
+{
+ struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+
+ if (tgt_prog) {
+ return tgt_prog->aux->btf;
+ } else {
+ return btf_vmlinux;
+ }
+}
+
+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+ /* t comes in already as a pointer */
+ t = btf_type_by_id(btf, t->type);
+
+ /* allow const */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
+ t = btf_type_by_id(btf, t->type);
+
+ /* char, signed char, unsigned char */
+ return btf_type_is_int(t) && t->size == 1;
+}
+
+bool btf_ctx_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const struct btf_type *t = prog->aux->attach_func_proto;
+ struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct btf *btf = bpf_prog_get_target_btf(prog);
+ const char *tname = prog->aux->attach_func_name;
+ struct bpf_verifier_log *log = info->log;
+ const struct btf_param *args;
+ u32 nr_args, arg;
+ int ret;
+
+ if (off % 8) {
+ bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
+ tname, off);
+ return false;
+ }
+ arg = off / 8;
+ args = (const struct btf_param *)(t + 1);
+ /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
+ nr_args = t ? btf_type_vlen(t) : 5;
+ if (prog->aux->attach_btf_trace) {
+ /* skip first 'void *__data' argument in btf_trace_##name typedef */
+ args++;
+ nr_args--;
+ }
+
+ if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
+ arg == nr_args) {
+ if (!t)
+ /* Default prog with 5 args. 6th arg is retval. */
+ return true;
+ /* function return type */
+ t = btf_type_by_id(btf, t->type);
+ } else if (arg >= nr_args) {
+ bpf_log(log, "func '%s' doesn't have %d-th argument\n",
+ tname, arg + 1);
+ return false;
+ } else {
+ if (!t)
+ /* Default prog with 5 args */
+ return true;
+ t = btf_type_by_id(btf, args[arg].type);
+ }
+ /* skip modifiers */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t))
+ /* accessing a scalar */
+ return true;
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log,
+ "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
+ tname, arg,
+ __btf_name_by_offset(btf, t->name_off),
+ btf_kind_str[BTF_INFO_KIND(t->info)]);
+ return false;
+ }
+ if (t->type == 0)
+ /* This is a pointer to void.
+ * It is the same as scalar from the verifier safety pov.
+ * No further pointer walking is allowed.
+ */
+ return true;
+
+ if (is_string_ptr(btf, t))
+ return true;
+
+ /* this is a pointer to another type */
+ info->reg_type = PTR_TO_BTF_ID;
+
+ if (tgt_prog) {
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
+ if (ret > 0) {
+ info->btf_id = ret;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ info->btf_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ while (btf_type_is_modifier(t)) {
+ info->btf_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+ if (!btf_type_is_struct(t)) {
+ bpf_log(log,
+ "func '%s' arg%d type %s is not a struct\n",
+ tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ return false;
+ }
+ bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
+ tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ __btf_name_by_offset(btf, t->name_off));
+ return true;
+}
+
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf_type *t, int off, int size,
+ enum bpf_access_type atype,
+ u32 *next_btf_id)
+{
+ u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
+ const struct btf_type *mtype, *elem_type = NULL;
+ const struct btf_member *member;
+ const char *tname, *mname;
+
+again:
+ tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+ if (!btf_type_is_struct(t)) {
+ bpf_log(log, "Type '%s' is not a struct\n", tname);
+ return -EINVAL;
+ }
+
+ if (off + size > t->size) {
+ bpf_log(log, "access beyond struct %s at off %u size %u\n",
+ tname, off, size);
+ return -EACCES;
+ }
+
+ for_each_member(i, t, member) {
+ /* offset of the field in bytes */
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (off + size <= moff)
+ /* won't find anything, field is already too far */
+ break;
+
+ if (btf_member_bitfield_size(t, member)) {
+ u32 end_bit = btf_member_bit_offset(t, member) +
+ btf_member_bitfield_size(t, member);
+
+ /* off <= moff instead of off == moff because clang
+ * does not generate a BTF member for anonymous
+ * bitfield like the ":16" here:
+ * struct {
+ * int :16;
+ * int x:8;
+ * };
+ */
+ if (off <= moff &&
+ BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+ return SCALAR_VALUE;
+
+ /* off may be accessing a following member
+ *
+ * or
+ *
+ * Doing partial access at either end of this
+ * bitfield. Continue on this case also to
+ * treat it as not accessing this bitfield
+ * and eventually error out as field not
+ * found to keep it simple.
+ * It could be relaxed if there was a legit
+ * partial access case later.
+ */
+ continue;
+ }
+
+ /* In case of "off" is pointing to holes of a struct */
+ if (off < moff)
+ break;
+
+ /* type of the field */
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
+
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ &elem_type, &total_nelems);
+ if (IS_ERR(mtype)) {
+ bpf_log(log, "field %s doesn't have size\n", mname);
+ return -EFAULT;
+ }
+
+ mtrue_end = moff + msize;
+ if (off >= mtrue_end)
+ /* no overlap with member, keep iterating */
+ continue;
+
+ if (btf_type_is_array(mtype)) {
+ u32 elem_idx;
+
+ /* btf_resolve_size() above helps to
+ * linearize a multi-dimensional array.
+ *
+ * The logic here is treating an array
+ * in a struct as the following way:
+ *
+ * struct outer {
+ * struct inner array[2][2];
+ * };
+ *
+ * looks like:
+ *
+ * struct outer {
+ * struct inner array_elem0;
+ * struct inner array_elem1;
+ * struct inner array_elem2;
+ * struct inner array_elem3;
+ * };
+ *
+ * When accessing outer->array[1][0], it moves
+ * moff to "array_elem2", set mtype to
+ * "struct inner", and msize also becomes
+ * sizeof(struct inner). Then most of the
+ * remaining logic will fall through without
+ * caring the current member is an array or
+ * not.
+ *
+ * Unlike mtype/msize/moff, mtrue_end does not
+ * change. The naming difference ("_true") tells
+ * that it is not always corresponding to
+ * the current mtype/msize/moff.
+ * It is the true end of the current
+ * member (i.e. array in this case). That
+ * will allow an int array to be accessed like
+ * a scratch space,
+ * i.e. allow access beyond the size of
+ * the array's element as long as it is
+ * within the mtrue_end boundary.
+ */
+
+ /* skip empty array */
+ if (moff == mtrue_end)
+ continue;
+
+ msize /= total_nelems;
+ elem_idx = (off - moff) / msize;
+ moff += elem_idx * msize;
+ mtype = elem_type;
+ }
+
+ /* the 'off' we're looking for is either equal to start
+ * of this field or inside of this struct
+ */
+ if (btf_type_is_struct(mtype)) {
+ /* our field must be inside that union or struct */
+ t = mtype;
+
+ /* adjust offset we're looking for */
+ off -= moff;
+ goto again;
+ }
+
+ if (btf_type_is_ptr(mtype)) {
+ const struct btf_type *stype;
+ u32 id;
+
+ if (msize != size || off != moff) {
+ bpf_log(log,
+ "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n",
+ mname, moff, tname, off, size);
+ return -EACCES;
+ }
+
+ stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
+ if (btf_type_is_struct(stype)) {
+ *next_btf_id = id;
+ return PTR_TO_BTF_ID;
+ }
+ }
+
+ /* Allow more flexible access within an int as long as
+ * it is within mtrue_end.
+ * Since mtrue_end could be the end of an array,
+ * that also allows using an array of int as a scratch
+ * space. e.g. skb->cb[].
+ */
+ if (off + size > mtrue_end) {
+ bpf_log(log,
+ "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
+ mname, mtrue_end, tname, off, size);
+ return -EACCES;
+ }
+
+ return SCALAR_VALUE;
+ }
+ bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
+ return -EINVAL;
+}
+
+static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
+ int arg)
+{
+ char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
+ const struct btf_param *args;
+ const struct btf_type *t;
+ const char *tname, *sym;
+ u32 btf_id, i;
+
+ if (IS_ERR(btf_vmlinux)) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
+ if (!sym) {
+ bpf_log(log, "kernel doesn't have kallsyms\n");
+ return -EFAULT;
+ }
+
+ for (i = 1; i <= btf_vmlinux->nr_types; i++) {
+ t = btf_type_by_id(btf_vmlinux, i);
+ if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
+ continue;
+ tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+ if (!strcmp(tname, fnname))
+ break;
+ }
+ if (i > btf_vmlinux->nr_types) {
+ bpf_log(log, "helper %s type is not found\n", fnname);
+ return -ENOENT;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, t->type);
+ if (!btf_type_is_ptr(t))
+ return -EFAULT;
+ t = btf_type_by_id(btf_vmlinux, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EFAULT;
+
+ args = (const struct btf_param *)(t + 1);
+ if (arg >= btf_type_vlen(t)) {
+ bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
+ fnname, arg);
+ return -EINVAL;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, args[arg].type);
+ if (!btf_type_is_ptr(t) || !t->type) {
+ /* anything but the pointer to struct is a helper config bug */
+ bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
+ return -EFAULT;
+ }
+ btf_id = t->type;
+ t = btf_type_by_id(btf_vmlinux, t->type);
+ /* skip modifiers */
+ while (btf_type_is_modifier(t)) {
+ btf_id = t->type;
+ t = btf_type_by_id(btf_vmlinux, t->type);
+ }
+ if (!btf_type_is_struct(t)) {
+ bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
+ return -EFAULT;
+ }
+ bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
+ arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
+ return btf_id;
+}
+
+int btf_resolve_helper_id(struct bpf_verifier_log *log,
+ const struct bpf_func_proto *fn, int arg)
+{
+ int *btf_id = &fn->btf_id[arg];
+ int ret;
+
+ if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
+ return -EINVAL;
+
+ ret = READ_ONCE(*btf_id);
+ if (ret)
+ return ret;
+ /* ok to race the search. The result is the same */
+ ret = __btf_resolve_helper_id(log, fn->func, arg);
+ if (!ret) {
+ /* Function argument cannot be type 'void' */
+ bpf_log(log, "BTF resolution bug\n");
+ return -EFAULT;
+ }
+ WRITE_ONCE(*btf_id, ret);
+ return ret;
+}
+
+static int __get_type_size(struct btf *btf, u32 btf_id,
+ const struct btf_type **bad_type)
+{
+ const struct btf_type *t;
+
+ if (!btf_id)
+ /* void */
+ return 0;
+ t = btf_type_by_id(btf, btf_id);
+ while (t && btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!t) {
+ *bad_type = btf->types[0];
+ return -EINVAL;
+ }
+ if (btf_type_is_ptr(t))
+ /* kernel size of pointer. Not BPF's size of pointer*/
+ return sizeof(void *);
+ if (btf_type_is_int(t) || btf_type_is_enum(t))
+ return t->size;
+ *bad_type = t;
+ return -EINVAL;
+}
+
+int btf_distill_func_proto(struct bpf_verifier_log *log,
+ struct btf *btf,
+ const struct btf_type *func,
+ const char *tname,
+ struct btf_func_model *m)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs;
+ int ret;
+
+ if (!func) {
+ /* BTF function prototype doesn't match the verifier types.
+ * Fall back to 5 u64 args.
+ */
+ for (i = 0; i < 5; i++)
+ m->arg_size[i] = 8;
+ m->ret_size = 8;
+ m->nr_args = 5;
+ return 0;
+ }
+ args = (const struct btf_param *)(func + 1);
+ nargs = btf_type_vlen(func);
+ if (nargs >= MAX_BPF_FUNC_ARGS) {
+ bpf_log(log,
+ "The function %s has %d arguments. Too many.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ ret = __get_type_size(btf, func->type, &t);
+ if (ret < 0) {
+ bpf_log(log,
+ "The function %s return type %s is unsupported.\n",
+ tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ return -EINVAL;
+ }
+ m->ret_size = ret;
+
+ for (i = 0; i < nargs; i++) {
+ ret = __get_type_size(btf, args[i].type, &t);
+ if (ret < 0) {
+ bpf_log(log,
+ "The function %s arg%d type %s is unsupported.\n",
+ tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ return -EINVAL;
+ }
+ m->arg_size[i] = ret;
+ }
+ m->nr_args = nargs;
+ return 0;
+}
+
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * t1 points to BTF_KIND_FUNC in btf1
+ * t2 points to BTF_KIND_FUNC in btf2
+ * Returns:
+ * EINVAL - function prototype mismatch
+ * EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ const struct btf_param *args1, *args2;
+ const char *fn1, *fn2, *s1, *s2;
+ u32 nargs1, nargs2, i;
+
+ fn1 = btf_name_by_offset(btf1, t1->name_off);
+ fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+ if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn1);
+ return -EINVAL;
+ }
+ if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_by_id(btf1, t1->type);
+ if (!t1 || !btf_type_is_func_proto(t1))
+ return -EFAULT;
+ t2 = btf_type_by_id(btf2, t2->type);
+ if (!t2 || !btf_type_is_func_proto(t2))
+ return -EFAULT;
+
+ args1 = (const struct btf_param *)(t1 + 1);
+ nargs1 = btf_type_vlen(t1);
+ args2 = (const struct btf_param *)(t2 + 1);
+ nargs2 = btf_type_vlen(t2);
+
+ if (nargs1 != nargs2) {
+ bpf_log(log, "%s() has %d args while %s() has %d args\n",
+ fn1, nargs1, fn2, nargs2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (t1->info != t2->info) {
+ bpf_log(log,
+ "Return type %s of %s() doesn't match type %s of %s()\n",
+ btf_type_str(t1), fn1,
+ btf_type_str(t2), fn2);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nargs1; i++) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+ if (t1->info != t2->info) {
+ bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+ i, fn1, btf_type_str(t1),
+ fn2, btf_type_str(t2));
+ return -EINVAL;
+ }
+ if (btf_type_has_size(t1) && t1->size != t2->size) {
+ bpf_log(log,
+ "arg%d in %s() has size %d while %s() has %d\n",
+ i, fn1, t1->size,
+ fn2, t2->size);
+ return -EINVAL;
+ }
+
+ /* global functions are validated with scalars and pointers
+ * to context only. And only global functions can be replaced.
+ * Hence type check only those types.
+ */
+ if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ continue;
+ if (!btf_type_is_ptr(t1)) {
+ bpf_log(log,
+ "arg%d in %s() has unrecognized type\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ if (!btf_type_is_struct(t2)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn2);
+ return -EINVAL;
+ }
+ /* This is an optional check to make program writing easier.
+ * Compare names of structs and report an error to the user.
+ * btf_prepare_func_args() already checked that t2 struct
+ * is a context type. btf_prepare_func_args() will check
+ * later that t1 struct is a context type as well.
+ */
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2)) {
+ bpf_log(log,
+ "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+ i, fn1, s1, fn2, s2);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Compare BTFs of given program with BTF of target program */
+int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ struct btf *btf1 = prog->aux->btf;
+ const struct btf_type *t1;
+ u32 btf_id = 0;
+
+ if (!prog->aux->func_info) {
+ bpf_log(&env->log, "Program extension requires BTF\n");
+ return -EINVAL;
+ }
+
+ btf_id = prog->aux->func_info[0].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ t1 = btf_type_by_id(btf1, btf_id);
+ if (!t1 || !btf_type_is_func(t1))
+ return -EFAULT;
+
+ return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ t = btf_type_by_id(btf, btf_id);
+ if (!t || !btf_type_is_func(t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+
+ t = btf_type_by_id(btf, t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid BTF of func %s\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > 5) {
+ bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
+ goto out;
+ }
+ /* check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ if (reg[i + 1].type == SCALAR_VALUE)
+ continue;
+ bpf_log(log, "R%d is not a scalar\n", i + 1);
+ goto out;
+ }
+ if (btf_type_is_ptr(t)) {
+ if (reg[i + 1].type == SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a pointer\n", i + 1);
+ goto out;
+ }
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
+ if (reg[i + 1].type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ goto out;
+ }
+ if (check_ctx_reg(env, &reg[i + 1], i + 1))
+ goto out;
+ continue;
+ }
+ }
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ goto out;
+ }
+ return 0;
+out:
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return -EINVAL;
+}
+
+/* Convert BTF of a function into bpf_reg_state if possible
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - cannot convert BTF.
+ * 0 - Successfully converted BTF into bpf_reg_state
+ * (either PTR_TO_CTX or SCALAR_VALUE).
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (!prog->aux->func_info ||
+ prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "Verifier bug\n");
+ return -EFAULT;
+ }
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id) {
+ bpf_log(log, "Global functions need valid BTF\n");
+ return -EFAULT;
+ }
+
+ t = btf_type_by_id(btf, btf_id);
+ if (!t || !btf_type_is_func(t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "Validating %s() func#%d...\n",
+ tname, subprog);
+
+ if (prog->aux->func_info_aux[subprog].unreliable) {
+ bpf_log(log, "Verifier bug in function %s()\n", tname);
+ return -EFAULT;
+ }
+ if (prog_type == BPF_PROG_TYPE_EXT)
+ prog_type = prog->aux->linked_prog->type;
+
+ t = btf_type_by_id(btf, t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid type of function %s()\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > 5) {
+ bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ /* check that function returns int */
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ bpf_log(log,
+ "Global function %s() doesn't return scalar. Only those are supported.\n",
+ tname);
+ return -EINVAL;
+ }
+ /* Convert BTF function arguments into verifier types.
+ * Only PTR_TO_CTX and SCALAR are supported atm.
+ */
+ for (i = 0; i < nargs; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ reg[i + 1].type = SCALAR_VALUE;
+ continue;
+ }
+ if (btf_type_is_ptr(t) &&
+ btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg[i + 1].type = PTR_TO_CTX;
+ continue;
+ }
+ bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
struct seq_file *m)
{
@@ -3376,6 +4479,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
}
+#ifdef CONFIG_PROC_FS
+static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct btf *btf = filp->private_data;
+
+ seq_printf(m, "btf_id:\t%u\n", btf->id);
+}
+#endif
+
static int btf_release(struct inode *inode, struct file *filp)
{
btf_put(filp->private_data);
@@ -3383,6 +4495,9 @@ static int btf_release(struct inode *inode, struct file *filp)
}
const struct file_operations btf_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_btf_show_fdinfo,
+#endif
.release = btf_release,
};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 0a00eaca6fae..9a500fadbef5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp)
*/
static void cgroup_bpf_release(struct work_struct *work)
{
- struct cgroup *cgrp = container_of(work, struct cgroup,
- bpf.release_work);
+ struct cgroup *p, *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *old_array;
unsigned int type;
@@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_unlock(&cgroup_mutex);
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
cgroup_put(cgrp);
}
@@ -103,8 +106,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type,
- u32 new_flags)
+ enum bpf_attach_type type)
{
struct cgroup *p;
@@ -180,8 +182,8 @@ static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
struct bpf_prog_array *old_array)
{
- rcu_swap_protected(cgrp->bpf.effective[type], old_array,
- lockdep_is_held(&cgroup_mutex));
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
@@ -199,6 +201,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
struct bpf_prog_array *arrays[NR] = {};
+ struct cgroup *p;
int ret, i;
ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
@@ -206,6 +209,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
if (ret)
return ret;
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_get(p);
+
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -283,31 +289,34 @@ cleanup:
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_prog *replace_prog,
enum bpf_attach_type type, u32 flags)
{
+ u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_prog_list *pl, *replace_pl = NULL;
enum bpf_cgroup_storage_type stype;
- struct bpf_prog_list *pl;
- bool pl_was_allocated;
int err;
- if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+ ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type, flags))
+ if (!hierarchy_allows_attach(cgrp, type))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -317,6 +326,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+ if (pl->prog == replace_prog)
+ replace_pl = pl;
+ }
+ if ((flags & BPF_F_REPLACE) && !replace_pl)
+ /* prog to replace not found for cgroup */
+ return -ENOENT;
+ } else if (!list_empty(progs)) {
+ replace_pl = list_first_entry(progs, typeof(*pl), node);
+ }
+
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storage[stype])) {
@@ -327,53 +351,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
}
}
- if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node) {
- if (pl->prog == prog) {
- /* disallow attaching the same prog twice */
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -EINVAL;
- }
+ if (replace_pl) {
+ pl = replace_pl;
+ old_prog = pl->prog;
+ for_each_cgroup_storage_type(stype) {
+ old_storage[stype] = pl->storage[stype];
+ bpf_cgroup_storage_unlink(old_storage[stype]);
}
-
+ } else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
return -ENOMEM;
}
-
- pl_was_allocated = true;
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
list_add_tail(&pl->node, progs);
- } else {
- if (list_empty(progs)) {
- pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl) {
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -ENOMEM;
- }
- pl_was_allocated = true;
- list_add_tail(&pl->node, progs);
- } else {
- pl = list_first_entry(progs, typeof(*pl), node);
- old_prog = pl->prog;
- for_each_cgroup_storage_type(stype) {
- old_storage[stype] = pl->storage[stype];
- bpf_cgroup_storage_unlink(old_storage[stype]);
- }
- pl_was_allocated = false;
- }
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
}
- cgrp->bpf.flags[type] = flags;
+ pl->prog = prog;
+ for_each_cgroup_storage_type(stype)
+ pl->storage[stype] = storage[stype];
+
+ cgrp->bpf.flags[type] = saved_flags;
err = update_effective_progs(cgrp, type);
if (err)
@@ -401,7 +400,7 @@ cleanup:
pl->storage[stype] = old_storage[stype];
bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
}
- if (pl_was_allocated) {
+ if (!replace_pl) {
list_del(&pl->node);
kfree(pl);
}
@@ -539,6 +538,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
+ struct bpf_prog *replace_prog = NULL;
struct cgroup *cgrp;
int ret;
@@ -546,8 +546,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+ (attr->attach_flags & BPF_F_REPLACE)) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+ if (IS_ERR(replace_prog)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(replace_prog);
+ }
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
attr->attach_flags);
+
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
cgroup_put(cgrp);
return ret;
}
@@ -964,7 +976,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- ctx->optlen = max_optlen;
return 0;
}
@@ -984,7 +995,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
.level = *level,
.optname = *optname,
};
- int ret;
+ int ret, max_optlen;
/* Opportunistic check to see whether we have any BPF program
* attached to the hook so we don't waste time allocating
@@ -994,10 +1005,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
return 0;
- ret = sockopt_alloc_buf(&ctx, *optlen);
+ /* Allocate a bit more than the initial user buffer for
+ * BPF program. The canonical use case is overriding
+ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+ */
+ max_optlen = max_t(int, 16, *optlen);
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
if (ret)
return ret;
+ ctx.optlen = *optlen;
+
if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
ret = -EFAULT;
goto out;
@@ -1016,7 +1035,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
ret = 1;
- } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
ret = -EFAULT;
} else {
@@ -1063,6 +1082,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
if (ret)
return ret;
+ ctx.optlen = max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1302,12 +1323,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
return false;
switch (off) {
- case offsetof(struct bpf_sysctl, write):
+ case bpf_ctx_range(struct bpf_sysctl, write):
if (type != BPF_READ)
return false;
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
- case offsetof(struct bpf_sysctl, file_pos):
+ case bpf_ctx_range(struct bpf_sysctl, file_pos):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
@@ -1325,13 +1346,14 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+ u32 read_size;
switch (si->off) {
case offsetof(struct bpf_sysctl, write):
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct bpf_sysctl_kern, write,
- FIELD_SIZEOF(struct bpf_sysctl_kern,
+ sizeof_field(struct bpf_sysctl_kern,
write),
target_size));
break;
@@ -1356,7 +1378,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
*insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg, 0);
+ BPF_SIZEOF(u32), treg, si->src_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, sizeof(u32), sizeof(loff_t)));
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1365,8 +1389,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
si->dst_reg, si->src_reg,
offsetof(struct bpf_sysctl_kern, ppos));
+ read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
*insn++ = BPF_LDX_MEM(
- BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+ BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, read_size, sizeof(loff_t)));
}
*target_size = sizeof(u32);
break;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..973a20d49749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -30,7 +30,8 @@
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
-
+#include <linux/extable.h>
+#include <linux/log2.h>
#include <asm/unaligned.h>
/* Registers */
@@ -221,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
u32 pages, delta;
int ret;
- BUG_ON(fp_old == NULL);
-
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
@@ -255,6 +254,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
{
if (fp->aux) {
free_percpu(fp->aux->stats);
+ kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
vfree(fp);
@@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
}
-void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
+static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
@@ -518,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
-int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
-int bpf_jit_kallsyms __read_mostly;
long bpf_jit_limit __read_mostly;
static __always_inline void
@@ -668,9 +668,6 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
{
struct latch_tree_node *n;
- if (!bpf_jit_kallsyms_enabled())
- return NULL;
-
n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
return n ?
container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
@@ -712,6 +709,24 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
+const struct exception_table_entry *search_bpf_extables(unsigned long addr)
+{
+ const struct exception_table_entry *e = NULL;
+ struct bpf_prog *prog;
+
+ rcu_read_lock();
+ prog = bpf_prog_kallsyms_find(addr);
+ if (!prog)
+ goto out;
+ if (!prog->aux->num_exentries)
+ goto out;
+
+ e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
+out:
+ rcu_read_unlock();
+ return e;
+}
+
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym)
{
@@ -740,6 +755,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return ret;
}
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+ struct bpf_jit_poke_descriptor *poke)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ static const u32 poke_tab_max = 1024;
+ u32 slot = prog->aux->size_poke_tab;
+ u32 size = slot + 1;
+
+ if (size > poke_tab_max)
+ return -ENOSPC;
+ if (poke->ip || poke->ip_stable || poke->adj_off)
+ return -EINVAL;
+
+ switch (poke->reason) {
+ case BPF_POKE_REASON_TAIL_CALL:
+ if (!poke->tail_call.map)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+
+ memcpy(&tab[slot], poke, sizeof(*poke));
+ prog->aux->size_poke_tab = size;
+ prog->aux->poke_tab = tab;
+
+ return slot;
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -800,6 +848,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
struct bpf_binary_header *hdr;
u32 size, hole, start, pages;
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
/* Most of BPF filters are really small, but if some of them
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
@@ -890,7 +941,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
- struct bpf_insn *to_buff)
+ struct bpf_insn *to_buff,
+ bool emit_zext)
{
struct bpf_insn *to = to_buff;
u32 imm_rnd = get_random_int();
@@ -1005,6 +1057,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
break;
@@ -1088,7 +1142,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn[1].code == 0)
memcpy(aux, insn, sizeof(aux));
- rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
+ clone->aux->verifier_zext);
if (!rewritten)
continue;
@@ -1287,6 +1342,12 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+{
+ memset(dst, 0, size);
+ return -EFAULT;
+}
+
/**
* __bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1306,6 +1367,10 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
};
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
@@ -1538,6 +1603,16 @@ out:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
+#define LDX_PROBE(SIZEOP, SIZE) \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
+ CONT;
+ LDX_PROBE(B, 1)
+ LDX_PROBE(H, 2)
+ LDX_PROBE(W, 4)
+ LDX_PROBE(DW, 8)
+#undef LDX_PROBE
+
STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
(DST + insn->off));
@@ -1648,18 +1723,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array,
if (fp->kprobe_override)
return false;
- if (!array->owner_prog_type) {
+ if (!array->aux->type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->owner_prog_type = fp->type;
- array->owner_jited = fp->jited;
-
+ array->aux->type = fp->type;
+ array->aux->jited = fp->jited;
return true;
}
- return array->owner_prog_type == fp->type &&
- array->owner_jited == fp->jited;
+ return array->aux->type == fp->type &&
+ array->aux->jited == fp->jited;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
@@ -1960,18 +2034,52 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}
+static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype) {
+ if (!aux->cgroup_storage[stype])
+ continue;
+ bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
+ }
+}
+
+void __bpf_free_used_maps(struct bpf_prog_aux *aux,
+ struct bpf_map **used_maps, u32 len)
+{
+ struct bpf_map *map;
+ u32 i;
+
+ bpf_free_cgroup_storage(aux);
+ for (i = 0; i < len; i++) {
+ map = used_maps[i];
+ if (map->ops->map_poke_untrack)
+ map->ops->map_poke_untrack(map, aux);
+ bpf_map_put(map);
+ }
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
+ kfree(aux->used_maps);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_free_used_maps(aux);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
#endif
+ bpf_trampoline_put(aux->trampoline);
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
@@ -1987,6 +2095,8 @@ void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
+ if (aux->linked_prog)
+ bpf_prog_put(aux->linked_prog);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
@@ -2027,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -2101,6 +2212,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+ void *addr1, void *addr2)
+{
+ return -ENOTSUPP;
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..70f71b154fa5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -72,17 +72,18 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
+static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
+
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
- int ret, cpu;
u64 cost;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- cmap->flush_list = alloc_percpu(struct list_head);
- if (!cmap->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
- goto free_percpu;
+ goto free_charge;
return &cmap->map;
-free_percpu:
- free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -399,22 +390,14 @@ free_rcu:
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
- int cpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
- /* Flush remaining packets in percpu bulkq */
- for_each_online_cpu(cpu) {
- struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
-
- /* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(bq, false);
- }
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
put_cpu_map_entry(rcpu);
@@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
* percpu bulkq to queue. Due to caller map_delete_elem() disable
* preemption, cannot call kthread_stop() to make sure queue is empty.
* Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
* stopping kthread, emptying the queue.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- int cpu;
u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
- /* To ensure all pending flush operations have completed wait for flush
- * list be empty on _all_ cpus. Because the above synchronize_rcu()
- * ensures the map is disconnected from the program we can assume no new
- * items will be added to the list.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
/* For cpu_map the remote CPUs can still be using the entries
* (struct bpf_cpu_map_entry).
*/
@@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU graze-period */
+ /* bq flush and cleanup happens after RCU grace-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_flush(struct bpf_map *map)
+void __cpu_map_flush(void)
{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
}
}
+
+static int __init cpu_map_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
+ return 0;
+}
+
+subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d83cf8ccc872..58bdca5d978a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -37,6 +37,12 @@
* notifier hook walks the map we know that new dev references can not be
* added by the user because core infrastructure ensures dev_get_by_index()
* calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
#include <net/xdp.h>
@@ -47,109 +53,149 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
-struct bpf_dtab_netdev;
-
-struct xdp_bulk_queue {
+struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
+ struct net_device *dev;
struct net_device *dev_rx;
- struct bpf_dtab_netdev *obj;
unsigned int count;
};
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
+ struct hlist_node index_hlist;
struct bpf_dtab *dtab;
- unsigned int bit;
- struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
+ unsigned int idx;
};
struct bpf_dtab {
struct bpf_map map;
- struct bpf_dtab_netdev **netdev_map;
- struct list_head __percpu *flush_list;
+ struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
struct list_head list;
+
+ /* these are only used for DEVMAP_HASH type maps */
+ struct hlist_head *dev_index_head;
+ spinlock_t index_lock;
+ unsigned int items;
+ u32 n_buckets;
};
+static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static struct hlist_head *dev_map_create_hash(unsigned int entries)
{
- struct bpf_dtab *dtab;
- int err, cpu;
- u64 cost;
+ int i;
+ struct hlist_head *hash;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
+ hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < entries; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+ int idx)
+{
+ return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
+ u64 cost = 0;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
- dtab = kzalloc(sizeof(*dtab), GFP_USER);
- if (!dtab)
- return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&dtab->map, attr);
- /* make sure page count doesn't overflow */
- cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
- cost += sizeof(struct list_head) * num_possible_cpus();
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
+ if (!dtab->n_buckets) /* Overflow check */
+ return -EINVAL;
+ cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
+ } else {
+ cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ }
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
- goto free_dtab;
+ return -EINVAL;
- err = -ENOMEM;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ if (!dtab->dev_index_head)
+ goto free_charge;
- dtab->flush_list = alloc_percpu(struct list_head);
- if (!dtab->flush_list)
- goto free_charge;
+ spin_lock_init(&dtab->index_lock);
+ } else {
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ goto free_charge;
+ }
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
+ return 0;
- dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
- sizeof(struct bpf_dtab_netdev *),
- dtab->map.numa_node);
- if (!dtab->netdev_map)
- goto free_percpu;
+free_charge:
+ bpf_map_charge_finish(&dtab->map.memory);
+ return -ENOMEM;
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ err = dev_map_init_map(dtab, attr);
+ if (err) {
+ kfree(dtab);
+ return ERR_PTR(err);
+ }
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
-
-free_percpu:
- free_percpu(dtab->flush_list);
-free_charge:
- bpf_map_charge_finish(&dtab->map.memory);
-free_dtab:
- kfree(dtab);
- return ERR_PTR(err);
}
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i, cpu;
+ int i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * both rcu read critical sections complete and waits for
+ * preempt-disable regions (NAPI being the relevant context here) so we
+ * are certain there will be no further reads against the netdev_map and
+ * all flush operations are complete. Flush operations can only be done
+ * from NAPI context for this reason.
*/
spin_lock(&dev_map_lock);
@@ -162,32 +208,37 @@ static void dev_map_free(struct bpf_map *map)
/* Make sure prior __dev_map_entry_free() have completed. */
rcu_barrier();
- /* To ensure all pending flush operations have completed wait for flush
- * list to empty on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new items will be added.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
+ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ for (i = 0; i < dtab->n_buckets; i++) {
+ struct bpf_dtab_netdev *dev;
+ struct hlist_head *head;
+ struct hlist_node *next;
- while (!list_empty(flush_list))
- cond_resched();
- }
+ head = dev_map_index_hash(dtab, i);
- for (i = 0; i < dtab->map.max_entries; i++) {
- struct bpf_dtab_netdev *dev;
+ hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+ hlist_del_rcu(&dev->index_hlist);
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+ }
+
+ kfree(dtab->dev_index_head);
+ } else {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
- dev = dtab->netdev_map[i];
- if (!dev)
- continue;
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
- free_percpu(dev->bulkq);
- dev_put(dev->dev);
- kfree(dev);
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ bpf_map_area_free(dtab->netdev_map);
}
- free_percpu(dtab->flush_list);
- bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -208,11 +259,68 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
- bool in_napi_ctx)
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct hlist_head *head = dev_map_index_hash(dtab, key);
+ struct bpf_dtab_netdev *dev;
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
+ if (dev->idx == key)
+ return dev;
+
+ return NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 idx, *next = next_key;
+ struct bpf_dtab_netdev *dev, *next_dev;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first;
+
+ idx = *(u32 *)key;
+
+ dev = __dev_map_hash_lookup_elem(map, idx);
+ if (!dev)
+ goto find_first;
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+ struct bpf_dtab_netdev, index_hlist);
+
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+
+ i = idx & (dtab->n_buckets - 1);
+ i++;
+
+ find_first:
+ for (; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct bpf_dtab_netdev,
+ index_hlist);
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
- struct bpf_dtab_netdev *obj = bq->obj;
- struct net_device *dev = obj->dev;
+ struct net_device *dev = bq->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -235,8 +343,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
bq->count = 0;
- trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
- sent, drops, bq->dev_rx, dev, err);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
return 0;
@@ -247,33 +354,29 @@ error:
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
- /* RX path under NAPI protection, can return frames faster */
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
drops++;
}
goto out;
}
-/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
*/
-void __dev_map_flush(struct bpf_map *map)
+void __dev_flush(void)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
- struct xdp_bulk_queue *bq, *tmp;
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq, *tmp;
- rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
- bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
- rcu_read_unlock();
+ bq_xmit_all(bq, XDP_XMIT_FLUSH);
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -295,15 +398,14 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
-
{
- struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
- struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(bq, 0, true);
+ bq_xmit_all(bq, 0);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -320,10 +422,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
int err;
@@ -338,7 +439,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- return bq_enqueue(dst, xdpf, dev_rx);
+ return bq_enqueue(dev, xdpf, dev_rx);
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ return __xdp_enqueue(dev, xdp, dev_rx);
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+
+ return __xdp_enqueue(dev, xdp, dev_rx);
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -363,19 +478,13 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
- if (dev->dev->netdev_ops->ndo_xdp_xmit) {
- struct xdp_bulk_queue *bq;
- int cpu;
+ struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+ *(u32 *)key);
+ struct net_device *dev = obj ? obj->dev : NULL;
- rcu_read_lock();
- for_each_online_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
- }
- rcu_read_unlock();
- }
+ return dev ? &dev->ifindex : NULL;
}
static void __dev_map_entry_free(struct rcu_head *rcu)
@@ -383,8 +492,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
- dev_map_flush_old(dev);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -399,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
- * completed, but this does not guarantee a flush has happened
- * yet. Because driver side rcu_read_lock/unlock only protects the
- * running XDP program. However, for pending flush operations the
- * dev and ctx are stored in another per cpu map. And additionally,
- * the driver tear down ensures all soft irqs are complete before
- * removing the net device in the case of dev_put equals zero.
+ * completed as well as any flush operations because call_rcu
+ * will wait for preempt-disable region to complete, NAPI in this
+ * context. And additionally, the driver tear down ensures all
+ * soft irqs are complete before removing the net device in the
+ * case of dev_put equals zero.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
@@ -412,17 +518,59 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+ unsigned long flags;
+ int ret = -ENOENT;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, k);
+ if (old_dev) {
+ dtab->items--;
+ hlist_del_init_rcu(&old_dev->index_hlist);
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ return ret;
+}
+
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+ struct bpf_dtab *dtab,
+ u32 ifindex,
+ unsigned int idx)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ dtab->map.numa_node);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
+ }
+
+ dev->idx = idx;
+ dev->dtab = dtab;
+
+ return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct net *net = current->nsproxy->net_ns;
- gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
u32 ifindex = *(u32 *)value;
- struct xdp_bulk_queue *bq;
u32 i = *(u32 *)key;
- int cpu;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -434,31 +582,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
- if (!dev)
- return -ENOMEM;
-
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return -ENOMEM;
- }
-
- for_each_possible_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq->obj = dev;
- }
-
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- free_percpu(dev->bulkq);
- kfree(dev);
- return -EINVAL;
- }
-
- dev->bit = i;
- dev->dtab = dtab;
+ dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
}
/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -472,6 +598,70 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 ifindex = *(u32 *)value;
+ u32 idx = *(u32 *)key;
+ unsigned long flags;
+ int err = -EEXIST;
+
+ if (unlikely(map_flags > BPF_EXIST || !ifindex))
+ return -EINVAL;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, idx);
+ if (old_dev && (map_flags & BPF_NOEXIST))
+ goto out_err;
+
+ dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out_err;
+ }
+
+ if (old_dev) {
+ hlist_del_rcu(&old_dev->index_hlist);
+ } else {
+ if (dtab->items >= dtab->map.max_entries) {
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ call_rcu(&dev->rcu, __dev_map_entry_free);
+ return -E2BIG;
+ }
+ dtab->items++;
+ }
+
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_map_index_hash(dtab, idx));
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+
+out_err:
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ return err;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
@@ -482,14 +672,64 @@ const struct bpf_map_ops dev_map_ops = {
.map_check_btf = map_check_no_btf,
};
+const struct bpf_map_ops dev_map_hash_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_hash_get_next_key,
+ .map_lookup_elem = dev_map_hash_lookup_elem,
+ .map_update_elem = dev_map_hash_update_elem,
+ .map_delete_elem = dev_map_hash_delete_elem,
+ .map_check_btf = map_check_no_btf,
+};
+
+static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
+ struct net_device *netdev)
+{
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+ for (i = 0; i < dtab->n_buckets; i++) {
+ struct bpf_dtab_netdev *dev;
+ struct hlist_head *head;
+ struct hlist_node *next;
+
+ head = dev_map_index_hash(dtab, i);
+
+ hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+ if (netdev != dev->dev)
+ continue;
+
+ dtab->items--;
+ hlist_del_rcu(&dev->index_hlist);
+ call_rcu(&dev->rcu, __dev_map_entry_free);
+ }
+ }
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+}
+
static int dev_map_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab;
- int i;
+ int i, cpu;
switch (event) {
+ case NETDEV_REGISTER:
+ if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+ break;
+
+ /* will be freed in free_netdev() */
+ netdev->xdp_bulkq =
+ __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
+ sizeof(void *), GFP_ATOMIC);
+ if (!netdev->xdp_bulkq)
+ return NOTIFY_BAD;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete
@@ -498,6 +738,11 @@ static int dev_map_notification(struct notifier_block *notifier,
*/
rcu_read_lock();
list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dev_map_hash_remove_netdev(dtab, netdev);
+ continue;
+ }
+
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
@@ -524,10 +769,15 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ int cpu;
+
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..b3e5b214fed8
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect. The dispatcher hijacks a trampoline function it via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ * unsigned int (*bpf_func)(const void *,
+ * const struct bpf_insn *));
+ */
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+ struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (prog == d->progs[i].prog)
+ return &d->progs[i];
+ }
+ return NULL;
+}
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+ struct bpf_dispatcher *d)
+{
+ return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (entry) {
+ refcount_inc(&entry->users);
+ return false;
+ }
+
+ entry = bpf_dispatcher_find_free(d);
+ if (!entry)
+ return false;
+
+ bpf_prog_inc(prog);
+ entry->prog = prog;
+ refcount_set(&entry->users, 1);
+ d->num_progs++;
+ return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (!entry)
+ return false;
+
+ if (refcount_dec_and_test(&entry->users)) {
+ entry->prog = NULL;
+ bpf_prog_put(prog);
+ d->num_progs--;
+ return true;
+ }
+ return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+ return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+ s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (d->progs[i].prog)
+ *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+ }
+ return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+ void *old, *new;
+ u32 noff;
+ int err;
+
+ if (!prev_num_progs) {
+ old = NULL;
+ noff = 0;
+ } else {
+ old = d->image + d->image_off;
+ noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
+ }
+
+ new = d->num_progs ? d->image + noff : NULL;
+ if (new) {
+ if (bpf_dispatcher_prepare(d, new))
+ return;
+ }
+
+ err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+ if (err || !new)
+ return;
+
+ d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+ struct bpf_prog *to)
+{
+ bool changed = false;
+ int prev_num_progs;
+
+ if (from == to)
+ return;
+
+ mutex_lock(&d->mutex);
+ if (!d->image) {
+ d->image = bpf_image_alloc();
+ if (!d->image)
+ goto out;
+ }
+
+ prev_num_progs = d->num_progs;
+ changed |= bpf_dispatcher_remove_prog(d, from);
+ changed |= bpf_dispatcher_add_prog(d, to);
+
+ if (!changed)
+ goto out;
+
+ bpf_dispatcher_update(d, prev_num_progs);
+out:
+ mutex_unlock(&d->mutex);
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..2d182c4ee9d9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,6 +17,16 @@
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map,
+ bool is_percpu)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ batch = 0;
+ if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+ return -EFAULT;
+
+ if (batch >= htab->n_buckets)
+ return -ENOENT;
+
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ size = round_up(value_size, 8);
+ if (is_percpu)
+ value_size = size * num_possible_cpus();
+ total = 0;
+ /* while experimenting with hash tables with sizes ranging from 10 to
+ * 1000, it was observed that a bucket can have upto 5 entries.
+ */
+ bucket_size = 5;
+
+alloc:
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto after_loop;
+ }
+
+again:
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+again_nocopy:
+ dst_key = keys;
+ dst_val = values;
+ b = &htab->buckets[batch];
+ head = &b->head;
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ goto after_loop;
+ }
+
+ if (bucket_cnt > bucket_size) {
+ bucket_size = bucket_cnt;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ kvfree(keys);
+ kvfree(values);
+ goto alloc;
+ }
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ if (is_percpu) {
+ int off = 0, cpu;
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(dst_val + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ } else {
+ value = l->key + roundup_key_size;
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value,
+ true);
+ else
+ copy_map_value(map, dst_val, value);
+ check_and_init_map_lock(map, dst_val);
+ }
+ if (do_delete) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (is_lru_map)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ else
+ free_htab_elem(htab, l);
+ }
+ dst_key += key_size;
+ dst_val += value_size;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ /* If we are not copying data, we can go to next bucket and avoid
+ * unlocking the rcu.
+ */
+ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+ batch++;
+ goto again_nocopy;
+ }
+
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+ key_size * bucket_cnt) ||
+ copy_to_user(uvalues + total * value_size, values,
+ value_size * bucket_cnt))) {
+ ret = -EFAULT;
+ goto after_loop;
+ }
+
+ total += bucket_cnt;
+ batch++;
+ if (batch >= htab->n_buckets) {
+ ret = -ENOENT;
+ goto after_loop;
+ }
+ goto again;
+
+after_loop:
+ if (ret == -EFAULT)
+ goto out;
+
+ /* copy # of entries and next batch */
+ ubatch = u64_to_user_ptr(attr->batch.out_batch);
+ if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab),
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab_lru),
};
/* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_percpu),
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_lru_percpu),
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5e28718928ca..d8b7b110a1c5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
+#include <linux/jiffies.h>
#include "../../lib/kstrtox.h"
@@ -312,12 +313,23 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
struct cgroup *cgrp = task_dfl_cgroup(current);
- return cgrp->kn->id.id;
+ return cgroup_id(cgrp);
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cc0d0cf114e3..5e40e7fccc21 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -14,8 +14,9 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/kdev_t.h>
-#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -30,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- raw = bpf_prog_inc(raw);
+ bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- raw = bpf_map_inc(raw, true);
+ bpf_map_inc_with_uref(raw);
break;
default:
WARN_ON_ONCE(1);
@@ -195,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -207,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
@@ -379,7 +379,7 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
-static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -388,7 +388,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
umode_t mode;
int ret;
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -421,30 +421,22 @@ out:
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
- struct filename *pname;
enum bpf_type type;
void *raw;
int ret;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
raw = bpf_fd_probe_obj(ufd, &type);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pname, raw, type);
+ ret = bpf_obj_do_pin(pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
+
return ret;
}
-static void *bpf_obj_do_get(const struct filename *pathname,
+static void *bpf_obj_do_get(const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -452,7 +444,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
void *raw;
int ret;
- ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -479,36 +471,27 @@ out:
int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
- struct filename *pname;
- int ret = -ENOENT;
int f_flags;
void *raw;
+ int ret;
f_flags = bpf_get_file_flag(flags);
if (f_flags < 0)
return f_flags;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
- raw = bpf_obj_do_get(pname, &type, f_flags);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ raw = bpf_obj_do_get(pathname, &type, f_flags);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else
- goto out;
+ return -ENOENT;
if (ret < 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
return ret;
}
@@ -533,7 +516,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type
if (!bpf_prog_get_ok(prog, &type, false))
return ERR_PTR(-EINVAL);
- return bpf_prog_inc(prog);
+ bpf_prog_inc(prog);
+ return prog;
}
struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
@@ -583,58 +567,47 @@ static const struct super_operations bpf_super_ops = {
enum {
OPT_MODE,
- OPT_ERR,
};
-static const match_table_t bpf_mount_tokens = {
- { OPT_MODE, "mode=%o" },
- { OPT_ERR, NULL },
+static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32oct ("mode", OPT_MODE),
+ {}
};
struct bpf_mount_opts {
umode_t mode;
};
-static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
+static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- substring_t args[MAX_OPT_ARGS];
- int option, token;
- char *ptr;
+ struct bpf_mount_opts *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts->mode = S_IRWXUGO;
-
- while ((ptr = strsep(&data, ",")) != NULL) {
- if (!*ptr)
- continue;
-
- token = match_token(ptr, bpf_mount_tokens, args);
- switch (token) {
- case OPT_MODE:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
+ opt = fs_parse(fc, bpf_fs_parameters, param, &result);
+ if (opt < 0)
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
* better continue to ignore non-existing options for bpf.
*/
- }
+ return opt == -ENOPARAM ? 0 : opt;
+
+ switch (opt) {
+ case OPT_MODE:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
}
return 0;
}
-static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts opts;
+ struct bpf_mount_opts *opts = fc->fs_private;
struct inode *inode;
int ret;
- ret = bpf_parse_options(data, &opts);
- if (ret)
- return ret;
-
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -644,21 +617,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
inode = sb->s_root->d_inode;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= S_ISVTX | opts.mode;
+ inode->i_mode |= S_ISVTX | opts->mode;
return 0;
}
-static struct dentry *bpf_mount(struct file_system_type *type, int flags,
- const char *dev_name, void *data)
+static int bpf_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, bpf_fill_super);
+}
+
+static void bpf_free_fc(struct fs_context *fc)
{
- return mount_nodev(type, flags, data, bpf_fill_super);
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations bpf_context_ops = {
+ .free = bpf_free_fc,
+ .parse_param = bpf_parse_param,
+ .get_tree = bpf_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int bpf_init_fs_context(struct fs_context *fc)
+{
+ struct bpf_mount_opts *opts;
+
+ opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ opts->mode = S_IRWXUGO;
+
+ fc->fs_private = opts;
+ fc->ops = &bpf_context_ops;
+ return 0;
}
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
- .mount = bpf_mount,
+ .init_fs_context = bpf_init_fs_context,
+ .parameters = bpf_fs_parameters,
.kill_sb = kill_litter_super,
};
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index addd6fdceec8..33d01866bcc2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map {
struct bpf_map map;
spinlock_t lock;
- struct bpf_prog *prog;
+ struct bpf_prog_aux *aux;
struct rb_root root;
struct list_head list;
};
@@ -357,7 +357,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
* The first field must be a 64 bit integer at 0 offset.
*/
m = (struct btf_member *)(key_type + 1);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
return -EINVAL;
@@ -366,7 +366,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
*/
m++;
offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
return -EINVAL;
@@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_seq_show_elem = cgroup_storage_seq_show_elem,
};
-int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
@@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
spin_lock_bh(&map->lock);
- if (map->prog && map->prog != prog)
+ if (map->aux && map->aux != aux)
goto unlock;
- if (prog->aux->cgroup_storage[stype] &&
- prog->aux->cgroup_storage[stype] != _map)
+ if (aux->cgroup_storage[stype] &&
+ aux->cgroup_storage[stype] != _map)
goto unlock;
- map->prog = prog;
- prog->aux->cgroup_storage[stype] = _map;
+ map->aux = aux;
+ aux->cgroup_storage[stype] = _map;
ret = 0;
unlock:
spin_unlock_bh(&map->lock);
@@ -443,16 +443,16 @@ unlock:
return ret;
}
-void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
spin_lock_bh(&map->lock);
- if (map->prog == prog) {
- WARN_ON(prog->aux->cgroup_storage[stype] != _map);
- map->prog = NULL;
- prog->aux->cgroup_storage[stype] = NULL;
+ if (map->aux == aux) {
+ WARN_ON(aux->cgroup_storage[stype] != _map);
+ map->aux = NULL;
+ aux->cgroup_storage[stype] = NULL;
}
spin_unlock_bh(&map->lock);
}
@@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
return;
storage->key.attach_type = type;
- storage->key.cgroup_inode_id = cgroup->kn->id.id;
+ storage->key.cgroup_inode_id = cgroup_id(cgroup);
map = storage->map;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index fab4fb134547..b3c48d1533cb 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -17,13 +17,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
if (IS_ERR(inner_map))
return inner_map;
- /* prog_array->owner_prog_type and owner_jited
- * is a runtime binding. Doing static check alone
- * in the verifier is not enough.
+ /* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing static check alone in the verifier is not enough.
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
+ inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
@@ -98,7 +98,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
return inner_map;
if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
- inner_map = bpf_map_inc(inner_map, false);
+ bpf_map_inc(inner_map);
else
inner_map = ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ba635209ae9a..2c5dc6541ece 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -302,14 +302,14 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
struct inode *ns_inode;
struct path ns_path;
char __user *uinsns;
- void *res;
+ int res;
u32 ulen;
res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
- if (IS_ERR(res)) {
+ if (res) {
if (!info->ifindex)
return -ENODEV;
- return PTR_ERR(res);
+ return res;
}
down_read(&bpf_devs_lock);
@@ -526,13 +526,13 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
};
struct inode *ns_inode;
struct path ns_path;
- void *res;
+ int res;
res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
- if (IS_ERR(res)) {
+ if (res) {
if (!info->ifindex)
return -ENODEV;
- return PTR_ERR(res);
+ return res;
}
ns_inode = ns_path.dentry->d_inode;
@@ -678,8 +678,10 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
down_write(&bpf_devs_lock);
if (!offdevs_inited) {
err = rhashtable_init(&offdevs, &offdevs_params);
- if (err)
+ if (err) {
+ up_write(&bpf_devs_lock);
return ERR_PTR(err);
+ }
offdevs_inited = true;
}
up_write(&bpf_devs_lock);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..3f958b90d914 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -287,16 +287,17 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
bool irq_work_busy = false;
struct stack_map_irq_work *work = NULL;
- if (in_nmi()) {
+ if (irqs_disabled()) {
work = this_cpu_ptr(&up_read_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
/*
- * We cannot do up_read() in nmi context. To do build_id lookup
- * in nmi context, we need to run up_read() in irq_work. We use
+ * We cannot do up_read() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To do build_id lookup when the
+ * irqs are disabled, we need to run up_read() in irq_work. We use
* a percpu variable to do the irq_work. If the irq_work is
* already used by another lookup, we fall back to report ips.
*
@@ -338,7 +339,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* up_read_non_owner(). The rwsem_release() is called
* here to release the lock from lockdep's perspective.
*/
- rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
+ rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f16f6fa..a91ad518c050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,13 +23,16 @@
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
+#include <linux/audit.h>
+#include <uapi/linux/btf.h>
-#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
- (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
- (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
- (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
-#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
+ IS_FD_HASH(map))
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
@@ -42,7 +45,7 @@ static DEFINE_SPINLOCK(map_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
static const struct bpf_map_ops * const bpf_map_types[] = {
-#define BPF_PROG_TYPE(_id, _ops)
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
#include <linux/bpf_types.h>
@@ -126,7 +129,153 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
-void *bpf_map_area_alloc(size_t size, int numa_node)
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ return round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ return sizeof(u32);
+ else
+ return map->value_size;
+}
+
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+ /* Wait for any running BPF programs to complete so that
+ * userspace, when we return to it, knows that all programs
+ * that could be running use the new map value.
+ */
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+ synchronize_rcu();
+}
+
+static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
+ void *value, __u64 flags)
+{
+ int err;
+
+ /* Need to create a kthread, thus must support schedule */
+ if (bpf_map_is_dev_bound(map)) {
+ return bpf_map_offload_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ return map->ops->map_update_elem(map, key, value, flags);
+ } else if (IS_FD_PROG_ARRAY(map)) {
+ return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ }
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ flags);
+ } else if (IS_FD_ARRAY(map)) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_push_elem(map, value, flags);
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+ __u64 flags)
+{
+ void *ptr;
+ int err;
+
+ if (bpf_map_is_dev_bound(map))
+ return bpf_map_offload_lookup_elem(map, key, value);
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_peek_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* struct_ops map requires directly updating "value" */
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ } else {
+ rcu_read_lock();
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
+ if (flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
+ }
+ rcu_read_unlock();
+ }
+
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
+static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
* under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
@@ -141,18 +290,36 @@ void *bpf_map_area_alloc(size_t size, int numa_node)
const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
void *area;
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ if (size >= SIZE_MAX)
+ return NULL;
+
+ /* kmalloc()'ed memory can't be mmap()'ed */
+ if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
numa_node);
if (area != NULL)
return area;
}
-
+ if (mmapable) {
+ BUG_ON(!PAGE_ALIGNED(size));
+ return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
+ __GFP_RETRY_MAYFAIL | flags);
+ }
return __vmalloc_node_flags_caller(size, numa_node,
GFP_KERNEL | __GFP_RETRY_MAYFAIL |
flags, __builtin_return_address(0));
}
+void *bpf_map_area_alloc(u64 size, int numa_node)
+{
+ return __bpf_map_area_alloc(size, numa_node, false);
+}
+
+void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
+{
+ return __bpf_map_area_alloc(size, numa_node, true);
+}
+
void bpf_map_area_free(void *area)
{
kvfree(area);
@@ -197,7 +364,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
atomic_long_sub(pages, &user->locked_vm);
}
-int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
+int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
{
u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
struct user_struct *user;
@@ -310,7 +477,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
static void bpf_map_put_uref(struct bpf_map *map)
{
- if (atomic_dec_and_test(&map->usercnt)) {
+ if (atomic64_dec_and_test(&map->usercnt)) {
if (map->ops->map_release_uref)
map->ops->map_release_uref(map);
}
@@ -321,7 +488,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
*/
static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
- if (atomic_dec_and_test(&map->refcnt)) {
+ if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
@@ -370,13 +537,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
- u32 owner_prog_type = 0;
- u32 owner_jited = 0;
+ u32 type = 0, jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
- owner_prog_type = array->owner_prog_type;
- owner_jited = array->owner_jited;
+ type = array->aux->type;
+ jited = array->aux->jited;
}
seq_printf(m,
@@ -396,12 +562,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->memory.pages * 1ULL << PAGE_SHIFT,
map->id,
READ_ONCE(map->frozen));
-
- if (owner_prog_type) {
- seq_printf(m, "owner_prog_type:\t%u\n",
- owner_prog_type);
- seq_printf(m, "owner_jited:\t%u\n",
- owner_jited);
+ if (type) {
+ seq_printf(m, "owner_prog_type:\t%u\n", type);
+ seq_printf(m, "owner_jited:\t%u\n", jited);
}
}
#endif
@@ -424,6 +587,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
return -EINVAL;
}
+/* called for any extra memory-mapped regions (except initial) */
+static void bpf_map_mmap_open(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+
+ bpf_map_inc_with_uref(map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ mutex_lock(&map->freeze_mutex);
+ map->writecnt++;
+ mutex_unlock(&map->freeze_mutex);
+ }
+}
+
+/* called for all unmapped memory region (including initial) */
+static void bpf_map_mmap_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+
+ if (vma->vm_flags & VM_WRITE) {
+ mutex_lock(&map->freeze_mutex);
+ map->writecnt--;
+ mutex_unlock(&map->freeze_mutex);
+ }
+
+ bpf_map_put_with_uref(map);
+}
+
+static const struct vm_operations_struct bpf_map_default_vmops = {
+ .open = bpf_map_mmap_open,
+ .close = bpf_map_mmap_close,
+};
+
+static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct bpf_map *map = filp->private_data;
+ int err;
+
+ if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ return -ENOTSUPP;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ mutex_lock(&map->freeze_mutex);
+
+ if ((vma->vm_flags & VM_WRITE) && map->frozen) {
+ err = -EPERM;
+ goto out;
+ }
+
+ /* set default open/close callbacks */
+ vma->vm_ops = &bpf_map_default_vmops;
+ vma->vm_private_data = map;
+
+ err = map->ops->map_mmap(map, vma);
+ if (err)
+ goto out;
+
+ bpf_map_inc_with_uref(map);
+
+ if (vma->vm_flags & VM_WRITE)
+ map->writecnt++;
+out:
+ mutex_unlock(&map->freeze_mutex);
+ return err;
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -431,6 +662,7 @@ const struct file_operations bpf_map_fops = {
.release = bpf_map_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
+ .mmap = bpf_map_mmap,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -542,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -556,6 +788,14 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (attr->btf_vmlinux_value_type_id) {
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ attr->btf_key_type_id || attr->btf_value_type_id)
+ return -EINVAL;
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ return -EINVAL;
+ }
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -574,35 +814,39 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map;
- atomic_set(&map->refcnt, 1);
- atomic_set(&map->usercnt, 1);
-
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ atomic64_set(&map->refcnt, 1);
+ atomic64_set(&map->usercnt, 1);
+ mutex_init(&map->freeze_mutex);
+
+ map->spin_lock_off = -EINVAL;
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
+ /* Even the map's value is a kernel's struct,
+ * the bpf_prog.o must have BTF to begin with
+ * to figure out the corresponding kernel's
+ * counter part. Thus, attr->btf_fd has
+ * to be valid also.
+ */
+ attr->btf_vmlinux_value_type_id) {
struct btf *btf;
- if (!attr->btf_value_type_id) {
- err = -EINVAL;
- goto free_map;
- }
-
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
+ map->btf = btf;
- err = map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
- if (err) {
- btf_put(btf);
- goto free_map;
+ if (attr->btf_value_type_id) {
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err)
+ goto free_map;
}
- map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
- } else {
- map->spin_lock_off = -EINVAL;
+ map->btf_vmlinux_value_type_id =
+ attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
@@ -652,21 +896,19 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-/* prog's and map's refcnt limit */
-#define BPF_MAX_REFCNT 32768
-
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
+void bpf_map_inc(struct bpf_map *map)
{
- if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&map->refcnt);
- return ERR_PTR(-EBUSY);
- }
- if (uref)
- atomic_inc(&map->usercnt);
- return map;
+ atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);
+void bpf_map_inc_with_uref(struct bpf_map *map)
+{
+ atomic64_inc(&map->refcnt);
+ atomic64_inc(&map->usercnt);
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
+
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
struct fd f = fdget(ufd);
@@ -676,34 +918,36 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- map = bpf_map_inc(map, true);
+ bpf_map_inc_with_uref(map);
fdput(f);
return map;
}
/* map_idr_lock should have been held */
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
- bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
- refold = atomic_fetch_add_unless(&map->refcnt, 1, 0);
-
- if (refold >= BPF_MAX_REFCNT) {
- __bpf_map_put(map, false);
- return ERR_PTR(-EBUSY);
- }
-
+ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
if (!refold)
return ERR_PTR(-ENOENT);
-
if (uref)
- atomic_inc(&map->usercnt);
+ atomic64_inc(&map->usercnt);
return map;
}
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
+{
+ spin_lock_bh(&map_idr_lock);
+ map = __bpf_map_inc_not_zero(map, false);
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -729,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
- void *key, *value, *ptr;
+ void *key, *value;
u32 value_size;
struct fd f;
int err;
@@ -761,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else if (IS_FD_MAP(map))
- value_size = sizeof(u32);
- else
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_lookup_elem(map, key, value);
- goto done;
- }
-
- preempt_disable();
- this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
- } else if (IS_FD_ARRAY(map)) {
- err = bpf_fd_array_map_lookup_elem(map, key, value);
- } else if (IS_FD_HASH(map)) {
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_peek_elem(map, value);
- } else {
- rcu_read_lock();
- if (map->ops->map_lookup_elem_sys_only)
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
- else
- ptr = map->ops->map_lookup_elem(map, key);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- } else if (!ptr) {
- err = -ENOENT;
- } else {
- err = 0;
- if (attr->flags & BPF_F_LOCK)
- /* lock 'ptr' and copy everything but lock */
- copy_map_value_locked(map, value, ptr, true);
- else
- copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
- }
- rcu_read_unlock();
- }
- this_cpu_dec(bpf_prog_active);
- preempt_enable();
-
-done:
+ err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -845,16 +1031,6 @@ err_put:
return err;
}
-static void maybe_wait_bpf_programs(struct bpf_map *map)
-{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
- */
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
-}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
@@ -910,56 +1086,8 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- goto out;
- }
+ err = bpf_map_update_value(map, f, key, value, attr->flags);
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
- * inside bpf map update or delete otherwise deadlocks are possible
- */
- preempt_disable();
- __this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_update(map, key, value,
- attr->flags);
- } else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- /* rcu_read_lock() is not needed */
- err = bpf_fd_reuseport_array_update_elem(map, key, value,
- attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_push_elem(map, value, attr->flags);
- } else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
- }
- __this_cpu_dec(bpf_prog_active);
- preempt_enable();
- maybe_wait_bpf_programs(map);
-out:
free_value:
kfree(value);
free_key:
@@ -1001,6 +1129,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
+ } else if (IS_FD_PROG_ARRAY(map) ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* These maps require sleepable context */
+ err = map->ops->map_delete_elem(map, key);
+ goto out;
}
preempt_disable();
@@ -1085,6 +1218,220 @@ err_put:
return err;
}
+int generic_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 cp, max_count;
+ int err = 0;
+ void *key;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size))
+ break;
+
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ break;
+ }
+
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+ if (err)
+ break;
+ }
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(key);
+ return err;
+}
+
+int generic_map_update_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 value_size, cp, max_count;
+ int ufd = attr->map_fd;
+ void *key, *value;
+ struct fd f;
+ int err = 0;
+
+ f = fdget(ufd);
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!value) {
+ kfree(key);
+ return -ENOMEM;
+ }
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size) ||
+ copy_from_user(value, values + cp * value_size, value_size))
+ break;
+
+ err = bpf_map_update_value(map, f, key, value,
+ attr->batch.elem_flags);
+
+ if (err)
+ break;
+ }
+
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(value);
+ kfree(key);
+ return err;
+}
+
+#define MAP_LOOKUP_RETRIES 3
+
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ void *buf, *buf_prevkey, *prev_key, *key, *value;
+ int err, retry = MAP_LOOKUP_RETRIES;
+ u32 value_size, cp, max_count;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map))
+ return -EINVAL;
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!buf_prevkey)
+ return -ENOMEM;
+
+ buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ if (!buf) {
+ kvfree(buf_prevkey);
+ return -ENOMEM;
+ }
+
+ err = -EFAULT;
+ prev_key = NULL;
+ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
+ goto free_buf;
+ key = buf;
+ value = key + map->key_size;
+ if (ubatch)
+ prev_key = buf_prevkey;
+
+ for (cp = 0; cp < max_count;) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ if (err)
+ break;
+ err = bpf_map_copy_value(map, key, value,
+ attr->batch.elem_flags);
+
+ if (err == -ENOENT) {
+ if (retry) {
+ retry--;
+ continue;
+ }
+ err = -EINTR;
+ break;
+ }
+
+ if (err)
+ goto free_buf;
+
+ if (copy_to_user(keys + cp * map->key_size, key,
+ map->key_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+
+ if (!prev_key)
+ prev_key = buf_prevkey;
+
+ swap(prev_key, key);
+ retry = MAP_LOOKUP_RETRIES;
+ cp++;
+ }
+
+ if (err == -EFAULT)
+ goto free_buf;
+
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+ (cp && copy_to_user(uobatch, prev_key, map->key_size))))
+ err = -EFAULT;
+
+free_buf:
+ kfree(buf_prevkey);
+ kfree(buf);
+ return err;
+}
+
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -1162,6 +1509,13 @@ static int map_freeze(const union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+
+ mutex_lock(&map->freeze_mutex);
+
+ if (map->writecnt) {
+ err = -EBUSY;
+ goto err_put;
+ }
if (READ_ONCE(map->frozen)) {
err = -EBUSY;
goto err_put;
@@ -1173,12 +1527,13 @@ static int map_freeze(const union bpf_attr *attr)
WRITE_ONCE(map->frozen, true);
err_put:
+ mutex_unlock(&map->freeze_mutex);
fdput(f);
return err;
}
static const struct bpf_prog_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _name) \
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
@@ -1205,23 +1560,34 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
return 0;
}
-/* drop refcnt on maps used by eBPF program and free auxilary data */
-static void free_used_maps(struct bpf_prog_aux *aux)
-{
- enum bpf_cgroup_storage_type stype;
- int i;
+enum bpf_audit {
+ BPF_AUDIT_LOAD,
+ BPF_AUDIT_UNLOAD,
+ BPF_AUDIT_MAX,
+};
- for_each_cgroup_storage_type(stype) {
- if (!aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(aux->prog,
- aux->cgroup_storage[stype]);
- }
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
+ [BPF_AUDIT_LOAD] = "LOAD",
+ [BPF_AUDIT_UNLOAD] = "UNLOAD",
+};
- for (i = 0; i < aux->used_map_cnt; i++)
- bpf_map_put(aux->used_maps[i]);
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
+{
+ struct audit_context *ctx = NULL;
+ struct audit_buffer *ab;
- kfree(aux->used_maps);
+ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+ if (op == BPF_AUDIT_LOAD)
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "prog-id=%u op=%s",
+ prog->aux->id, bpf_audit_str[op]);
+ audit_log_end(ab);
}
int __bpf_prog_charge(struct user_struct *user, u32 pages)
@@ -1316,24 +1682,33 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
- free_used_maps(aux);
+ kvfree(aux->func_info);
+ kfree(aux->func_info_aux);
bpf_prog_uncharge_memlock(aux->prog);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
+static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
+{
+ bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ bpf_prog_free_linfo(prog);
+
+ if (deferred)
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ __bpf_prog_put_rcu(&prog->aux->rcu);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- bpf_prog_kallsyms_del_all(prog);
- btf_put(prog->aux->btf);
- kvfree(prog->aux->func_info);
- bpf_prog_free_linfo(prog);
-
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ __bpf_prog_put_noref(prog, true);
}
}
@@ -1435,13 +1810,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
+void bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_sub(i, &prog->aux->refcnt);
- return ERR_PTR(-EBUSY);
- }
- return prog;
+ atomic64_add(i, &prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_add);
@@ -1452,13 +1823,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i)
* path holds a reference to the program, thus atomic_sub() can
* be safely used in such cases!
*/
- WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
+ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
}
EXPORT_SYMBOL_GPL(bpf_prog_sub);
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+void bpf_prog_inc(struct bpf_prog *prog)
{
- return bpf_prog_add(prog, 1);
+ atomic64_inc(&prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
@@ -1467,12 +1838,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
int refold;
- refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0);
-
- if (refold >= BPF_MAX_REFCNT) {
- __bpf_prog_put(prog, false);
- return ERR_PTR(-EBUSY);
- }
+ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
if (!refold)
return ERR_PTR(-ENOENT);
@@ -1510,7 +1876,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
goto out;
}
- prog = bpf_prog_inc(prog);
+ bpf_prog_inc(prog);
out:
fdput(f);
return prog;
@@ -1555,9 +1921,28 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
}
static int
-bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
- enum bpf_attach_type expected_attach_type)
+bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
+ enum bpf_attach_type expected_attach_type,
+ u32 btf_id, u32 prog_fd)
{
+ if (btf_id) {
+ if (btf_id > BTF_MAX_TYPE)
+ return -EINVAL;
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ case BPF_PROG_TYPE_EXT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
+ prog_type != BPF_PROG_TYPE_EXT)
+ return -EINVAL;
+
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
@@ -1598,13 +1983,17 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_EXT:
+ if (expected_attach_type)
+ return -EINVAL;
+ /* fallthrough */
default:
return 0;
}
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt
+#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
@@ -1619,6 +2008,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_STATE_FREQ |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -1645,7 +2035,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
- if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
+ if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
+ attr->attach_btf_id,
+ attr->attach_prog_fd))
return -EINVAL;
/* plain bpf_prog allocation */
@@ -1654,6 +2046,17 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return -ENOMEM;
prog->expected_attach_type = attr->expected_attach_type;
+ prog->aux->attach_btf_id = attr->attach_btf_id;
+ if (attr->attach_prog_fd) {
+ struct bpf_prog *tgt_prog;
+
+ tgt_prog = bpf_prog_get(attr->attach_prog_fd);
+ if (IS_ERR(tgt_prog)) {
+ err = PTR_ERR(tgt_prog);
+ goto free_prog_nouncharge;
+ }
+ prog->aux->linked_prog = tgt_prog;
+ }
prog->aux->offload_requested = !!attr->prog_ifindex;
@@ -1675,7 +2078,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
prog->orig_prog = NULL;
prog->jited = 0;
- atomic_set(&prog->aux->refcnt, 1);
+ atomic64_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
@@ -1707,28 +2110,36 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0) {
- /* failed to allocate fd.
- * bpf_prog_put() is needed because the above
- * bpf_prog_alloc_id() has published the prog
- * to the userspace and the userspace may
- * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
- */
- bpf_prog_put(prog);
- return err;
- }
-
+ /* Upon success of bpf_prog_alloc_id(), the BPF prog is
+ * effectively publicly exposed. However, retrieving via
+ * bpf_prog_get_fd_by_id() will take another reference,
+ * therefore it cannot be gone underneath us.
+ *
+ * Only for the time /after/ successful bpf_prog_new_fd()
+ * and before returning to userspace, we might just hold
+ * one reference and any parallel close on that fd could
+ * rip everything out. Hence, below notifications must
+ * happen before bpf_prog_new_fd().
+ *
+ * Also, any failure handling from this point onwards must
+ * be using bpf_prog_put() given the program is exposed.
+ */
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ bpf_prog_put(prog);
return err;
free_used_maps:
- bpf_prog_free_linfo(prog);
- kvfree(prog->aux->func_info);
- btf_put(prog->aux->btf);
- bpf_prog_kallsyms_del_subprogs(prog);
- free_used_maps(prog->aux);
+ /* In case we have subprogs, we need to wait for a grace
+ * period before we can tear down JIT memory since symbols
+ * are already exposed under kallsyms.
+ */
+ __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
@@ -1758,6 +2169,50 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
+static int bpf_tracing_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_tracing_prog_fops = {
+ .release = bpf_tracing_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+};
+
+static int bpf_tracing_prog_attach(struct bpf_prog *prog)
+{
+ int tr_fd, err;
+
+ if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->type != BPF_PROG_TYPE_EXT) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ err = bpf_trampoline_link_prog(prog);
+ if (err)
+ goto out_put_prog;
+
+ tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops,
+ prog, O_CLOEXEC);
+ if (tr_fd < 0) {
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
+ err = tr_fd;
+ goto out_put_prog;
+ }
+ return tr_fd;
+
+out_put_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
struct bpf_raw_tracepoint {
struct bpf_raw_event_map *btp;
struct bpf_prog *prog;
@@ -1789,17 +2244,54 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
struct bpf_raw_tracepoint *raw_tp;
struct bpf_raw_event_map *btp;
struct bpf_prog *prog;
- char tp_name[128];
+ const char *tp_name;
+ char buf[128];
int tp_fd, err;
- if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
- sizeof(tp_name) - 1) < 0)
- return -EFAULT;
- tp_name[sizeof(tp_name) - 1] = 0;
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+ prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_EXT &&
+ prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_TRACING ||
+ prog->type == BPF_PROG_TYPE_EXT) {
+ if (attr->raw_tracepoint.name) {
+ /* The attach point for this category of programs
+ * should be specified via btf_id during program load.
+ */
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ tp_name = prog->aux->attach_func_name;
+ else
+ return bpf_tracing_prog_attach(prog);
+ } else {
+ if (strncpy_from_user(buf,
+ u64_to_user_ptr(attr->raw_tracepoint.name),
+ sizeof(buf) - 1) < 0) {
+ err = -EFAULT;
+ goto out_put_prog;
+ }
+ buf[sizeof(buf) - 1] = 0;
+ tp_name = buf;
+ }
btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp)
- return -ENOENT;
+ if (!btp) {
+ err = -ENOENT;
+ goto out_put_prog;
+ }
raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
if (!raw_tp) {
@@ -1807,38 +2299,27 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
goto out_put_btp;
}
raw_tp->btp = btp;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog)) {
- err = PTR_ERR(prog);
- goto out_free_tp;
- }
- if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
- prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
- err = -EINVAL;
- goto out_put_prog;
- }
+ raw_tp->prog = prog;
err = bpf_probe_register(raw_tp->btp, prog);
if (err)
- goto out_put_prog;
+ goto out_free_tp;
- raw_tp->prog = prog;
tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
O_CLOEXEC);
if (tp_fd < 0) {
bpf_probe_unregister(raw_tp->btp, prog);
err = tp_fd;
- goto out_put_prog;
+ goto out_free_tp;
}
return tp_fd;
-out_put_prog:
- bpf_prog_put(prog);
out_free_tp:
kfree(raw_tp);
out_put_btp:
bpf_put_raw_tracepoint(btp);
+out_put_prog:
+ bpf_prog_put(prog);
return err;
}
@@ -1859,10 +2340,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2124,17 +2605,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
-static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_prog *bpf_prog_by_id(u32 id)
{
struct bpf_prog *prog;
- u32 id = attr->prog_id;
- int fd;
- if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&prog_idr_lock);
prog = idr_find(&prog_idr, id);
@@ -2143,7 +2619,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
else
prog = ERR_PTR(-ENOENT);
spin_unlock_bh(&prog_idr_lock);
+ return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ prog = bpf_prog_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2177,7 +2668,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
- map = bpf_map_inc_not_zero(map, true);
+ map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
@@ -2593,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -2805,6 +3297,61 @@ out:
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(map, attr, uattr); \
+ } while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ struct bpf_map *map;
+ int err, ufd;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ ufd = attr->batch.map_fd;
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd != BPF_MAP_LOOKUP_BATCH &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
+ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ else if (cmd == BPF_MAP_UPDATE_BATCH)
+ BPF_DO_BATCH(map->ops->map_update_batch);
+ else
+ BPF_DO_BATCH(map->ops->map_delete_batch);
+
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -2874,6 +3421,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_obj_get_next_id(&attr, uattr,
&map_idr, &map_idr_lock);
break;
+ case BPF_BTF_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &btf_idr, &btf_idr_lock);
+ break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;
@@ -2898,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
new file mode 100644
index 000000000000..7ae5dddd1fe6
--- /dev/null
+++ b/kernel/bpf/sysfs_btf.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel BTF information for introspection and use by eBPF tools.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+#include <linux/sysfs.h>
+
+/* See scripts/link-vmlinux.sh, gen_btf() func for details */
+extern char __weak _binary__btf_vmlinux_bin_start[];
+extern char __weak _binary__btf_vmlinux_bin_end[];
+
+static ssize_t
+btf_vmlinux_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ memcpy(buf, _binary__btf_vmlinux_bin_start + off, len);
+ return len;
+}
+
+static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
+ .attr = { .name = "vmlinux", .mode = 0444, },
+ .read = btf_vmlinux_read,
+};
+
+static struct kobject *btf_kobj;
+
+static int __init btf_vmlinux_init(void)
+{
+ if (!_binary__btf_vmlinux_bin_start)
+ return 0;
+
+ btf_kobj = kobject_create_and_add("btf", kernel_kobj);
+ if (!btf_kobj)
+ return -ENOMEM;
+
+ bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
+ _binary__btf_vmlinux_bin_start;
+
+ return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
+}
+
+subsys_initcall(btf_vmlinux_init);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ca52b9642943..d4f335a9a899 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
-struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
/* if a.value is negative, arithmetic shifting by minimum shift
* will have larger negative offset compared to more shifting.
* If a.value is nonnegative, arithmetic shifting by minimum shift
* will have larger positive offset compare to more shifting.
*/
- return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
}
struct tnum tnum_add(struct tnum a, struct tnum b)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
new file mode 100644
index 000000000000..6b264a92064b
--- /dev/null
+++ b/kernel/bpf/trampoline.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
+
+/* dummy _ops. The verifier will operate on target program's ops. */
+const struct bpf_verifier_ops bpf_extension_verifier_ops = {
+};
+const struct bpf_prog_ops bpf_extension_prog_ops = {
+};
+
+/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
+#define TRAMPOLINE_HASH_BITS 10
+#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
+
+static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct latch_tree_root image_tree __cacheline_aligned;
+
+/* serializes access to trampoline_table and image_tree */
+static DEFINE_MUTEX(trampoline_mutex);
+
+static void *bpf_jit_alloc_exec_page(void)
+{
+ void *image;
+
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return NULL;
+
+ set_vm_flush_reset_perms(image);
+ /* Keep image as writeable. The alternative is to keep flipping ro/rw
+ * everytime new program is attached or detached.
+ */
+ set_memory_x((long)image, 1);
+ return image;
+}
+
+static __always_inline bool image_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ struct bpf_image *ia = container_of(a, struct bpf_image, tnode);
+ struct bpf_image *ib = container_of(b, struct bpf_image, tnode);
+
+ return ia < ib;
+}
+
+static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
+{
+ void *image = container_of(n, struct bpf_image, tnode);
+
+ if (addr < image)
+ return -1;
+ if (addr >= image + PAGE_SIZE)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops image_tree_ops = {
+ .less = image_tree_less,
+ .comp = image_tree_comp,
+};
+
+static void *__bpf_image_alloc(bool lock)
+{
+ struct bpf_image *image;
+
+ image = bpf_jit_alloc_exec_page();
+ if (!image)
+ return NULL;
+
+ if (lock)
+ mutex_lock(&trampoline_mutex);
+ latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops);
+ if (lock)
+ mutex_unlock(&trampoline_mutex);
+ return image->data;
+}
+
+void *bpf_image_alloc(void)
+{
+ return __bpf_image_alloc(true);
+}
+
+bool is_bpf_image_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+{
+ struct bpf_trampoline *tr;
+ struct hlist_head *head;
+ void *image;
+ int i;
+
+ mutex_lock(&trampoline_mutex);
+ head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head, hlist) {
+ if (tr->key == key) {
+ refcount_inc(&tr->refcnt);
+ goto out;
+ }
+ }
+ tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ if (!tr)
+ goto out;
+
+ /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
+ image = __bpf_image_alloc(false);
+ if (!image) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+
+ tr->key = key;
+ INIT_HLIST_NODE(&tr->hlist);
+ hlist_add_head(&tr->hlist, head);
+ refcount_set(&tr->refcnt, 1);
+ mutex_init(&tr->mutex);
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ INIT_HLIST_HEAD(&tr->progs_hlist[i]);
+ tr->image = image;
+out:
+ mutex_unlock(&trampoline_mutex);
+ return tr;
+}
+
+static int is_ftrace_location(void *ip)
+{
+ long addr;
+
+ addr = ftrace_location((long)ip);
+ if (!addr)
+ return 0;
+ if (WARN_ON_ONCE(addr != (long)ip))
+ return -EFAULT;
+ return 1;
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ return ret;
+}
+
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ return ret;
+}
+
+/* first time registering */
+static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ ret = is_ftrace_location(ip);
+ if (ret < 0)
+ return ret;
+ tr->func.ftrace_managed = ret;
+
+ if (tr->func.ftrace_managed)
+ ret = register_ftrace_direct((long)ip, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ return ret;
+}
+
+/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
+ * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
+ */
+#define BPF_MAX_TRAMP_PROGS 40
+
+static int bpf_trampoline_update(struct bpf_trampoline *tr)
+{
+ void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
+ void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
+ struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
+ int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
+ int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
+ struct bpf_prog **progs, **fentry, **fexit;
+ u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ struct bpf_prog_aux *aux;
+ int err;
+
+ if (fentry_cnt + fexit_cnt == 0) {
+ err = unregister_fentry(tr, old_image);
+ tr->selector = 0;
+ goto out;
+ }
+
+ /* populate fentry progs */
+ fentry = progs = progs_to_run;
+ hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
+ *progs++ = aux->prog;
+
+ /* populate fexit progs */
+ fexit = progs;
+ hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
+ *progs++ = aux->prog;
+
+ if (fexit_cnt)
+ flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+
+ /* Though the second half of trampoline page is unused a task could be
+ * preempted in the middle of the first half of trampoline and two
+ * updates to trampoline would change the code from underneath the
+ * preempted task. Hence wait for tasks to voluntarily schedule or go
+ * to userspace.
+ */
+ synchronize_rcu_tasks();
+
+ err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
+ &tr->func.model, flags,
+ fentry, fentry_cnt,
+ fexit, fexit_cnt,
+ tr->func.addr);
+ if (err < 0)
+ goto out;
+
+ if (tr->selector)
+ /* progs already running at this address */
+ err = modify_fentry(tr, old_image, new_image);
+ else
+ /* first time registering */
+ err = register_fentry(tr, new_image);
+ if (err)
+ goto out;
+ tr->selector++;
+out:
+ return err;
+}
+
+static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
+{
+ switch (t) {
+ case BPF_TRACE_FENTRY:
+ return BPF_TRAMP_FENTRY;
+ case BPF_TRACE_FEXIT:
+ return BPF_TRAMP_FEXIT;
+ default:
+ return BPF_TRAMP_REPLACE;
+ }
+}
+
+int bpf_trampoline_link_prog(struct bpf_prog *prog)
+{
+ enum bpf_tramp_prog_type kind;
+ struct bpf_trampoline *tr;
+ int err = 0;
+ int cnt;
+
+ tr = prog->aux->trampoline;
+ kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
+ mutex_lock(&tr->mutex);
+ if (tr->extension_prog) {
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ err = -EBUSY;
+ goto out;
+ }
+ cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt) {
+ err = -EBUSY;
+ goto out;
+ }
+ tr->extension_prog = prog;
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ prog->bpf_func);
+ goto out;
+ }
+ if (cnt >= BPF_MAX_TRAMP_PROGS) {
+ err = -E2BIG;
+ goto out;
+ }
+ if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+ /* prog already linked */
+ err = -EBUSY;
+ goto out;
+ }
+ hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+ tr->progs_cnt[kind]++;
+ err = bpf_trampoline_update(prog->aux->trampoline);
+ if (err) {
+ hlist_del(&prog->aux->tramp_hlist);
+ tr->progs_cnt[kind]--;
+ }
+out:
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
+{
+ enum bpf_tramp_prog_type kind;
+ struct bpf_trampoline *tr;
+ int err;
+
+ tr = prog->aux->trampoline;
+ kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
+ mutex_lock(&tr->mutex);
+ if (kind == BPF_TRAMP_REPLACE) {
+ WARN_ON_ONCE(!tr->extension_prog);
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ tr->extension_prog->bpf_func, NULL);
+ tr->extension_prog = NULL;
+ goto out;
+ }
+ hlist_del(&prog->aux->tramp_hlist);
+ tr->progs_cnt[kind]--;
+ err = bpf_trampoline_update(prog->aux->trampoline);
+out:
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+void bpf_trampoline_put(struct bpf_trampoline *tr)
+{
+ struct bpf_image *image;
+
+ if (!tr)
+ return;
+ mutex_lock(&trampoline_mutex);
+ if (!refcount_dec_and_test(&tr->refcnt))
+ goto out;
+ WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
+ goto out;
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
+ goto out;
+ image = container_of(tr->image, struct bpf_image, data);
+ latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
+ /* wait for tasks to get out of trampoline before freeing it */
+ synchronize_rcu_tasks();
+ bpf_jit_free_exec(image);
+ hlist_del(&tr->hlist);
+ kfree(tr);
+out:
+ mutex_unlock(&trampoline_mutex);
+}
+
+/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
+ * are needed for trampoline. The macro is split into
+ * call _bpf_prog_enter
+ * call prog->bpf_func
+ * call __bpf_prog_exit
+ */
+u64 notrace __bpf_prog_enter(void)
+{
+ u64 start = 0;
+
+ rcu_read_lock();
+ preempt_disable();
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ start = sched_clock();
+ return start;
+}
+
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+{
+ struct bpf_prog_stats *stats;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key) &&
+ /* static_key could be enabled in __bpf_prog_enter
+ * and disabled in __bpf_prog_exit.
+ * And vice versa.
+ * Hence check that 'start' is not zero.
+ */
+ start) {
+ stats = this_cpu_ptr(prog->aux->stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->cnt++;
+ stats->nsecs += sched_clock() - start;
+ u64_stats_update_end(&stats->syncp);
+ }
+ preempt_enable();
+ rcu_read_unlock();
+}
+
+int __weak
+arch_prepare_bpf_trampoline(void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_prog **fentry_progs, int fentry_cnt,
+ struct bpf_prog **fexit_progs, int fexit_cnt,
+ void *orig_call)
+{
+ return -ENOTSUPP;
+}
+
+static int __init init_trampolines(void)
+{
+ int i;
+
+ for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&trampoline_table[i]);
+ return 0;
+}
+late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c84d83f86141..1cc945daa9c8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,7 +23,7 @@
#include "disasm.h"
static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
-#define BPF_PROG_TYPE(_id, _name) \
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
@@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
+#define BPF_MAP_KEY_POISON (1ULL << 63)
+#define BPF_MAP_KEY_SEEN (1ULL << 62)
+
#define BPF_MAP_PTR_UNPRIV 1UL
#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
POISON_POINTER_DELTA))
@@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem {
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
- return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+ return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
- return aux->map_state & BPF_MAP_PTR_UNPRIV;
+ return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
@@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
{
BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
- aux->map_state = (unsigned long)map |
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+ aux->map_ptr_state = (unsigned long)map |
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
+
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_key_state & BPF_MAP_KEY_POISON;
+}
+
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
+{
+ return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
+}
+
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
+}
+
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
+{
+ bool poisoned = bpf_map_key_poisoned(aux);
+
+ aux->map_key_state = state | BPF_MAP_KEY_SEEN |
+ (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
struct bpf_call_arg_meta {
@@ -205,8 +231,11 @@ struct bpf_call_arg_meta {
u64 msize_umax_value;
int ref_obj_id;
int func_id;
+ u32 btf_id;
};
+struct btf *btf_vmlinux;
+
static DEFINE_MUTEX(bpf_verifier_lock);
static const struct bpf_line_info *
@@ -243,6 +272,10 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
n = min(log->len_total - log->len_used - 1, n);
log->kbuf[n] = '\0';
+ if (log->level == BPF_LOG_KERNEL) {
+ pr_err("BPF:%s\n", log->kbuf);
+ return;
+ }
if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
log->len_used += n;
else
@@ -280,6 +313,19 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -400,6 +446,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
};
static char slot_type_char[] = {
@@ -430,6 +477,12 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env,
return cur->frame[reg->frameno];
}
+const char *kernel_type_name(u32 id)
+{
+ return btf_name_by_offset(btf_vmlinux,
+ btf_type_by_id(btf_vmlinux, id)->name_off);
+}
+
static void print_verifier_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state)
{
@@ -454,6 +507,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
+ if (t == PTR_TO_BTF_ID)
+ verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
@@ -852,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(struct bpf_reg_state *reg);
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
/* Mark the unknown part of a register (variable offset or scalar value) as
* known to have the value @imm.
@@ -890,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
__mark_reg_known_zero(regs + regno);
@@ -978,6 +1034,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->umax_value));
}
+static void __reg_bound_offset32(struct bpf_reg_state *reg)
+{
+ u64 mask = 0xffffFFFF;
+ struct tnum range = tnum_range(reg->umin_value & mask,
+ reg->umax_value & mask);
+ struct tnum lo32 = tnum_cast(reg->var_off, 4);
+ struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
+
+ reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
+}
+
/* Reset the min/max bounds of a register */
static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
@@ -985,13 +1052,11 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
-
- /* constant backtracking is enabled for root only for now */
- reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
/*
* Clear type, id, off, and union(map_ptr, range) and
@@ -1001,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
+ reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
+ true : false;
__mark_reg_unbounded(reg);
}
@@ -1011,15 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- __mark_reg_unknown(regs + regno);
+ __mark_reg_unknown(env, regs + regno);
}
-static void __mark_reg_not_init(struct bpf_reg_state *reg)
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
reg->type = NOT_INIT;
}
@@ -1030,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
}
#define DEF_NOT_SUBREG (0)
@@ -1054,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].type = PTR_TO_STACK;
mark_reg_known_zero(env, regs, BPF_REG_FP);
regs[BPF_REG_FP].frameno = state->frameno;
-
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(env, regs, BPF_REG_1);
}
#define BPF_MAIN_FUNC (-1)
@@ -1771,16 +1835,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bitmap_from_u64(mask, stack_mask);
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* This can happen if backtracking
- * is propagating stack precision where
- * caller has larger stack frame
- * than callee, but backtrack_insn() should
- * have returned -ENOTSUPP.
+ /* the sequence of instructions:
+ * 2: (bf) r3 = r10
+ * 3: (7b) *(u64 *)(r3 -8) = r0
+ * 4: (79) r4 = *(u64 *)(r10 -8)
+ * doesn't contain jmps. It's backtracked
+ * as a single block.
+ * During backtracking insn 3 is not recognized as
+ * stack access, so at the end of backtracking
+ * stack slot fp-8 is still marked in stack_mask.
+ * However the parent state may not have accessed
+ * fp-8 and it's "unallocated" stack space.
+ * In such case fallback to conservative.
*/
- verbose(env, "BUG spi %d stack_size %d\n",
- i, func->allocated_stack);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
+ mark_all_scalars_precise(env, st);
+ return 0;
}
if (func->stack[i].slot_type[0] != STACK_SPILL) {
@@ -1843,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
return true;
default:
return false;
@@ -2325,10 +2395,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, enum bpf_reg_type *reg_type)
+ enum bpf_access_type t, enum bpf_reg_type *reg_type,
+ u32 *btf_id)
{
struct bpf_insn_access_aux info = {
.reg_type = *reg_type,
+ .log = &env->log,
};
if (env->ops->is_valid_access &&
@@ -2342,7 +2414,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ if (*reg_type == PTR_TO_BTF_ID)
+ *btf_id = info.btf_id;
+ else
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -2660,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-static int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+int check_ctx_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
{
/* Access to ctx or passing it to a helper is only allowed in
* its original, unmodified form.
@@ -2733,6 +2808,98 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
reg->smax_value = reg->umax_value;
}
+static bool bpf_map_is_rdonly(const struct bpf_map *map)
+{
+ return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+}
+
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
+{
+ void *ptr;
+ u64 addr;
+ int err;
+
+ err = map->ops->map_direct_value_addr(map, &addr, off);
+ if (err)
+ return err;
+ ptr = (void *)(long)addr + off;
+
+ switch (size) {
+ case sizeof(u8):
+ *val = (u64)*(u8 *)ptr;
+ break;
+ case sizeof(u16):
+ *val = (u64)*(u16 *)ptr;
+ break;
+ case sizeof(u32):
+ *val = (u64)*(u32 *)ptr;
+ break;
+ case sizeof(u64):
+ *val = *(u64 *)ptr;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ int regno, int off, int size,
+ enum bpf_access_type atype,
+ int value_regno)
+{
+ struct bpf_reg_state *reg = regs + regno;
+ const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
+ const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ u32 btf_id;
+ int ret;
+
+ if (off < 0) {
+ verbose(env,
+ "R%d is ptr_%s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
+ regno, tname, off, tn_buf);
+ return -EACCES;
+ }
+
+ if (env->ops->btf_struct_access) {
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
+ atype, &btf_id);
+ } else {
+ if (atype != BPF_READ) {
+ verbose(env, "only read is supported\n");
+ return -EACCES;
+ }
+
+ ret = btf_struct_access(&env->log, t, off, size, atype,
+ &btf_id);
+ }
+
+ if (ret < 0)
+ return ret;
+
+ if (atype == BPF_READ) {
+ if (ret == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, value_regno);
+ return 0;
+ }
+ mark_reg_known_zero(env, regs, value_regno);
+ regs[value_regno].type = PTR_TO_BTF_ID;
+ regs[value_regno].btf_id = btf_id;
+ }
+
+ return 0;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -2770,11 +2937,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
err = check_map_access(env, regno, off, size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(env, regs, value_regno);
+ if (!err && t == BPF_READ && value_regno >= 0) {
+ struct bpf_map *map = reg->map_ptr;
+
+ /* if map is read-only, track its contents as scalars */
+ if (tnum_is_const(reg->var_off) &&
+ bpf_map_is_rdonly(map) &&
+ map->ops->map_direct_value_addr) {
+ int map_off = off + reg->var_off.value;
+ u64 val = 0;
+
+ err = bpf_map_direct_read(map, map_off, size,
+ &val);
+ if (err)
+ return err;
+ regs[value_regno].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[value_regno], val);
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
+ }
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
+ u32 btf_id = 0;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -2786,7 +2972,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
+ if (err)
+ verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
* PTR_TO_PACKET[_META,_END]. In the latter
@@ -2805,6 +2993,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
+ if (reg_type == PTR_TO_BTF_ID)
+ regs[value_regno].btf_id = btf_id;
}
regs[value_regno].type = reg_type;
}
@@ -2864,6 +3054,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_BTF_ID) {
+ err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
+ value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]);
@@ -3049,7 +3242,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
- __mark_reg_unknown(&state->stack[spi].spilled_ptr);
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
state->stack[spi].slot_type[j] = STACK_MISC;
goto mark;
@@ -3292,6 +3485,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
goto err_type;
+ } else if (arg_type == ARG_PTR_TO_BTF_ID) {
+ expected_type = PTR_TO_BTF_ID;
+ if (type != expected_type)
+ goto err_type;
+ if (reg->btf_id != meta->btf_id) {
+ verbose(env, "Helper has type %s got %s in R%d\n",
+ kernel_type_name(meta->btf_id),
+ kernel_type_name(reg->btf_id), regno);
+
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) {
+ verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+ regno);
+ return -EACCES;
+ }
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true))
@@ -3439,6 +3648,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_skb_output &&
func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
@@ -3457,6 +3667,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
if (func_id != BPF_FUNC_redirect_map &&
func_id != BPF_FUNC_map_lookup_elem)
goto error;
@@ -3525,6 +3736,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
case BPF_FUNC_perf_event_read_value:
+ case BPF_FUNC_skb_output:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -3539,6 +3751,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
map->map_type != BPF_MAP_TYPE_CPUMAP &&
map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
@@ -3687,7 +3900,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -3715,7 +3928,7 @@ static void release_reg_references(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -3739,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ int i;
+
+ /* after the call registers r0 - r5 were scratched */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
+}
+
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int i, err, subprog, target_insn;
+ bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -3767,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ func_info_aux = env->prog->aux->func_info_aux;
+ if (func_info_aux)
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (is_global) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n",
+ subprog);
+ return err;
+ } else {
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env,
+ "Func#%d is global and valid. Skipping.\n",
+ subprog);
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+
+ /* continue with next insn after call */
+ return 0;
+ }
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -3793,11 +4046,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call registers r0 - r5 were scratched */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
- }
+ clear_caller_saved_regs(env, caller->regs);
/* only increment it after check_reg_arg() finished */
state->curframe++;
@@ -3908,15 +4157,54 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
- if (!BPF_MAP_PTR(aux->map_state))
+ if (!BPF_MAP_PTR(aux->map_ptr_state))
bpf_map_ptr_store(aux, meta->map_ptr,
meta->map_ptr->unpriv_array);
- else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+ else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
meta->map_ptr->unpriv_array);
return 0;
}
+static int
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_reg_state *regs = cur_regs(env), *reg;
+ struct bpf_map *map = meta->map_ptr;
+ struct tnum range;
+ u64 val;
+ int err;
+
+ if (func_id != BPF_FUNC_tail_call)
+ return 0;
+ if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
+ verbose(env, "kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+
+ range = tnum_range(0, map->max_entries - 1);
+ reg = &regs[BPF_REG_3];
+
+ if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+ return 0;
+ }
+
+ err = mark_chain_precision(env, BPF_REG_3);
+ if (err)
+ return err;
+
+ val = reg->var_off.value;
+ if (bpf_map_key_unseen(aux))
+ bpf_map_key_store(aux, val);
+ else if (!bpf_map_key_poisoned(aux) &&
+ bpf_map_key_immediate(aux) != val)
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+ return 0;
+}
+
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
@@ -3978,23 +4266,20 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
meta.func_id = func_id;
/* check args */
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
- if (err)
- return err;
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
- if (err)
- return err;
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
- if (err)
- return err;
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
- if (err)
- return err;
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
+ for (i = 0; i < 5; i++) {
+ err = btf_resolve_helper_id(&env->log, fn, i);
+ if (err > 0)
+ meta.btf_id = err;
+ err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
+ if (err)
+ return err;
+ }
+
+ err = record_func_map(env, &meta, func_id, insn_idx);
if (err)
return err;
- err = record_func_map(env, &meta, func_id, insn_idx);
+ err = record_func_key(env, &meta, func_id, insn_idx);
if (err)
return err;
@@ -4338,7 +4623,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -4590,13 +4875,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
if (!src_known &&
opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -4804,9 +5089,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+ if (insn_bitness == 32) {
+ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
+ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
+ } else {
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ }
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
+ insn_bitness);
/* blow away the dst_reg umin_value/umax_value and rely on
* dst_reg var_off to refine the result.
@@ -5425,6 +5717,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
+ if (is_jmp32) {
+ __reg_bound_offset32(false_reg);
+ __reg_bound_offset32(true_reg);
+ }
/* Intersecting with the old var_off might have improved our bounds
* slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.
@@ -5534,6 +5830,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
+ if (is_jmp32) {
+ __reg_bound_offset32(false_reg);
+ __reg_bound_offset32(true_reg);
+ }
/* Intersecting with the old var_off might have improved our bounds
* slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.
@@ -6011,6 +6311,7 @@ static bool may_access_skb(enum bpf_prog_type type)
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -6044,7 +6345,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(env, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
return err;
@@ -6063,7 +6364,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
@@ -6076,6 +6377,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6095,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
+ int err;
+
+ /* The struct_ops func-ptr's return type could be "void" */
+ if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
+ !prog->aux->attach_func_proto->type)
+ return 0;
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R0 leaks addr as return value\n");
+ return -EACCES;
+ }
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -6116,6 +6443,11 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ if (!env->prog->aux->attach_btf_id)
+ return 0;
+ range = tnum_const(0);
+ break;
default:
return 0;
}
@@ -6398,6 +6730,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
@@ -6431,6 +6764,9 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+ if (!info_aux)
+ goto err_free;
for (i = 0; i < nfuncs; i++) {
ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -6476,35 +6812,38 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
ret = -EINVAL;
goto err_free;
}
-
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
prog->aux->func_info = krecord;
prog->aux->func_info_cnt = nfuncs;
+ prog->aux->func_info_aux = info_aux;
return 0;
err_free:
kvfree(krecord);
+ kfree(info_aux);
return ret;
}
static void adjust_btf_func(struct bpf_verifier_env *env)
{
+ struct bpf_prog_aux *aux = env->prog->aux;
int i;
- if (!env->prog->aux->func_info)
+ if (!aux->func_info)
return;
for (i = 0; i < env->subprog_cnt; i++)
- env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
+ aux->func_info[i].insn_off = env->subprog_info[i].start;
}
#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
@@ -6719,7 +7058,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* since the register is unused, clear its state
* to make further comparison simpler
*/
- __mark_reg_not_init(&st->regs[i]);
+ __mark_reg_not_init(env, &st->regs[i]);
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
@@ -6727,7 +7066,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* liveness must not touch this stack slot anymore */
st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
if (!(live & REG_LIVE_READ)) {
- __mark_reg_not_init(&st->stack[i].spilled_ptr);
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
}
@@ -7220,7 +7559,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = false;
+ bool add_new_state = env->test_state_freq ? true : false;
cur->last_insn_idx = env->prev_insn_idx;
if (!env->insn_aux_data[insn_idx].prune_point)
@@ -7432,6 +7771,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
return false;
default:
return true;
@@ -7458,32 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
- env->prev_linfo = NULL;
-
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
- state->curframe = 0;
- state->speculative = false;
- state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
- if (!state->frame[0]) {
- kfree(state);
- return -ENOMEM;
- }
- env->cur_state = state;
- init_func_state(env, state->frame[0],
- BPF_MAIN_FUNC /* callsite */,
- 0 /* frameno */,
- 0 /* subprogno, zero == main subprog */);
-
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -7561,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -7747,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- /* eBPF calling convetion is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
- return -EACCES;
- }
-
err = check_return_code(env);
if (err)
return err;
@@ -7796,7 +8102,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -7809,7 +8115,6 @@ process_bpf_exit:
env->insn_idx++;
}
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -7869,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -8000,17 +8310,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
* will be used by the valid program until it's unloaded
* and all maps are released in free_used_maps()
*/
- map = bpf_map_inc(map, false);
- if (IS_ERR(map)) {
- fdput(f);
- return PTR_ERR(map);
- }
+ bpf_map_inc(map);
aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog, map)) {
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
verbose(env, "only one cgroup storage of each type is allowed\n");
fdput(f);
return -EBUSY;
@@ -8040,18 +8346,8 @@ next_insn:
/* drop refcnt of maps used by the rejected program */
static void release_maps(struct bpf_verifier_env *env)
{
- enum bpf_cgroup_storage_type stype;
- int i;
-
- for_each_cgroup_storage_type(stype) {
- if (!env->prog->aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(env->prog,
- env->prog->aux->cgroup_storage[stype]);
- }
-
- for (i = 0; i < env->used_map_cnt; i++)
- bpf_map_put(env->used_maps[i]);
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
+ env->used_map_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -8095,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = true;
+ new_data[i].seen = env->pass_cnt;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
@@ -8573,6 +8869,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
case PTR_TO_XDP_SOCK:
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
+ case PTR_TO_BTF_ID:
+ if (type == BPF_READ) {
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
+ verbose(env, "Writes through BTF pointers are not allowed\n");
+ return -EINVAL;
+ }
+ continue;
default:
continue;
}
@@ -8616,8 +8922,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- u8 shift = bpf_ctx_narrow_load_shift(off, size,
- size_default);
+ u8 shift = bpf_ctx_narrow_access_offset(
+ off, size, size_default) * 8;
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -8863,6 +9169,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ bool expect_blinding = bpf_jit_blinding_enabled(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@@ -8871,7 +9178,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
- int i, cnt, delta = 0;
+ int i, ret, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -9015,6 +9322,27 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
+ if (env->allow_ptr_leaks && !expect_blinding &&
+ prog->jit_requested &&
+ !bpf_map_key_poisoned(aux) &&
+ !bpf_map_ptr_poisoned(aux) &&
+ !bpf_map_ptr_unpriv(aux)) {
+ struct bpf_jit_poke_descriptor desc = {
+ .reason = BPF_POKE_REASON_TAIL_CALL,
+ .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+ .tail_call.key = bpf_map_key_immediate(aux),
+ };
+
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
+ if (ret < 0) {
+ verbose(env, "adding tail call poke descriptor failed\n");
+ return ret;
+ }
+
+ insn->imm = ret + 1;
+ continue;
+ }
+
if (!bpf_map_ptr_unpriv(aux))
continue;
@@ -9029,7 +9357,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
return -EINVAL;
}
- map_ptr = BPF_MAP_PTR(aux->map_state);
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -9063,7 +9391,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- map_ptr = BPF_MAP_PTR(aux->map_state);
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
@@ -9129,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -9143,6 +9495,23 @@ patch_call_imm:
insn->imm = fn->func - __bpf_call_base;
}
+ /* Since poke tab is now finalized, publish aux to tracker. */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ if (!map_ptr->ops->map_poke_track ||
+ !map_ptr->ops->map_poke_untrack ||
+ !map_ptr->ops->map_poke_run) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+ if (ret < 0) {
+ verbose(env, "tracking tail call prog failed\n");
+ return ret;
+ }
+ }
+
return 0;
}
@@ -9158,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->free_list = NULL;
if (!env->explored_states)
return;
@@ -9171,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->explored_states[i] = NULL;
}
+}
- kvfree(env->explored_states);
+/* The verifier is using insn_aux_data[] to store temporary data during
+ * verification and to store information for passes that run after the
+ * verification like dead code sanitization. do_check_common() for subprogram N
+ * may analyze many other subprograms. sanitize_insn_aux_data() clears all
+ * temporary data after do_check_common() finds that subprogram N cannot be
+ * verified independently. pass_cnt counts the number of times
+ * do_check_common() was run and insn->aux->seen tells the pass number
+ * insn_aux_data was touched. These variables are compared to clear temporary
+ * data from failed pass. For testing and experiments do_check_common() can be
+ * run multiple times even when prior attempt to verify is unsuccessful.
+ */
+static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_insn_aux_data *aux;
+ int i, class;
+
+ for (i = 0; i < env->prog->len; i++) {
+ class = BPF_CLASS(insn[i].code);
+ if (class != BPF_LDX && class != BPF_STX)
+ continue;
+ aux = &env->insn_aux_data[i];
+ if (aux->seen != env->pass_cnt)
+ continue;
+ memset(aux, 0, offsetof(typeof(*aux), orig_idx));
+ }
}
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_verifier_state *state;
+ struct bpf_reg_state *regs;
+ int ret, i;
+
+ env->prev_linfo = NULL;
+ env->pass_cnt++;
+
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ state->curframe = 0;
+ state->speculative = false;
+ state->branches = 1;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
+ ret = btf_prepare_func_args(env, subprog, regs);
+ if (ret)
+ goto out;
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (regs[i].type == PTR_TO_CTX)
+ mark_reg_known_zero(env, regs, i);
+ else if (regs[i].type == SCALAR_VALUE)
+ mark_reg_unknown(env, regs, i);
+ }
+ } else {
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
+ ret = btf_check_func_arg_match(env, subprog, regs);
+ if (ret == -EFAULT)
+ /* unlikely verifier bug. abort.
+ * ret == 0 and ret < 0 are sadly acceptable for
+ * main() function due to backward compatibility.
+ * Like socket filter program may be written as:
+ * int bpf_prog(struct pt_regs *ctx)
+ * and never dereference that ctx in the program.
+ * 'struct pt_regs' is a type mismatch for socket
+ * filter that should be using 'struct __sk_buff'.
+ */
+ goto out;
+ }
+
+ ret = do_check(env);
+out:
+ /* check for NULL is necessary, since cur_state can be freed inside
+ * do_check() under memory pressure.
+ */
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
+ while (!pop_stack(env, NULL, NULL));
+ free_states(env);
+ if (ret)
+ /* clean aux data in case subprog was rejected */
+ sanitize_insn_aux_data(env);
+ return ret;
+}
+
+/* Verify all global functions in a BPF program one by one based on their BTF.
+ * All global functions must pass verification. Otherwise the whole program is rejected.
+ * Consider:
+ * int bar(int);
+ * int foo(int f)
+ * {
+ * return bar(f);
+ * }
+ * int bar(int b)
+ * {
+ * ...
+ * }
+ * foo() will be verified first for R1=any_scalar_value. During verification it
+ * will be assumed that bar() already verified successfully and call to bar()
+ * from foo() will be checked for type match only. Later bar() will be verified
+ * independently to check that it's safe for R1=any_scalar_value.
+ */
+static int do_check_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int i, ret;
+
+ if (!aux->func_info)
+ return 0;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ continue;
+ env->insn_idx = env->subprog_info[i].start;
+ WARN_ON_ONCE(env->insn_idx == 0);
+ ret = do_check_common(env, i);
+ if (ret) {
+ return ret;
+ } else if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env,
+ "Func#%d is safe for any args that match its prototype\n",
+ i);
+ }
+ }
+ return 0;
+}
+
+static int do_check_main(struct bpf_verifier_env *env)
+{
+ int ret;
+
+ env->insn_idx = 0;
+ ret = do_check_common(env, 0);
+ if (!ret)
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+ return ret;
+}
+
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
@@ -9200,6 +9723,264 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
+{
+ const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops *st_ops;
+ const struct btf_member *member;
+ struct bpf_prog *prog = env->prog;
+ u32 btf_id, member_idx;
+ const char *mname;
+
+ btf_id = prog->aux->attach_btf_id;
+ st_ops = bpf_struct_ops_find(btf_id);
+ if (!st_ops) {
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
+ btf_id);
+ return -ENOTSUPP;
+ }
+
+ t = st_ops->type;
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t)) {
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
+ member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ NULL);
+ if (!func_proto) {
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
+ mname, member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ if (st_ops->check_member) {
+ int err = st_ops->check_member(t, member);
+
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+ }
+
+ prog->aux->attach_func_proto = func_proto;
+ prog->aux->attach_func_name = mname;
+ env->ops = st_ops->verifier_ops;
+
+ return 0;
+}
+
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+ struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char prefix[] = "btf_trace_";
+ int ret = 0, subprog = -1, i;
+ struct bpf_trampoline *tr;
+ const struct btf_type *t;
+ bool conservative = true;
+ const char *tname;
+ struct btf *btf;
+ long addr;
+ u64 key;
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension)
+ return 0;
+
+ if (!btf_id) {
+ verbose(env, "Tracing programs must provide btf_id\n");
+ return -EINVAL;
+ }
+ btf = bpf_prog_get_target_btf(prog);
+ if (!btf) {
+ verbose(env,
+ "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, btf_id);
+ if (!t) {
+ verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+ return -EINVAL;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname) {
+ verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
+ return -EINVAL;
+ }
+ if (tgt_prog) {
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ for (i = 0; i < aux->func_info_cnt; i++)
+ if (aux->func_info[i].type_id == btf_id) {
+ subprog = i;
+ break;
+ }
+ if (subprog == -1) {
+ verbose(env, "Subprog %s doesn't exist\n", tname);
+ return -EINVAL;
+ }
+ conservative = aux->func_info_aux[subprog].unreliable;
+ if (prog_extension) {
+ if (conservative) {
+ verbose(env,
+ "Cannot replace static functions\n");
+ return -EINVAL;
+ }
+ if (!prog->jit_requested) {
+ verbose(env,
+ "Extension programs should be JITed\n");
+ return -EINVAL;
+ }
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ }
+ if (!tgt_prog->jited) {
+ verbose(env, "Can attach to only JITed progs\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == prog->type) {
+ /* Cannot fentry/fexit another fentry/fexit program.
+ * Cannot attach program extension to another extension.
+ * It's ok to attach fentry/fexit to extension program.
+ */
+ verbose(env, "Cannot recursively attach\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog_extension &&
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ /* Program extensions can extend all program types
+ * except fentry/fexit. The reason is the following.
+ * The fentry/fexit programs are used for performance
+ * analysis, stats and can be attached to any program
+ * type except themselves. When extension program is
+ * replacing XDP function it is necessary to allow
+ * performance analysis of all functions. Both original
+ * XDP program and its program extension. Hence
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
+ * allowed. If extending of fentry/fexit was allowed it
+ * would be possible to create long call chain
+ * fentry->extension->fentry->extension beyond
+ * reasonable stack size. Hence extending fentry is not
+ * allowed.
+ */
+ verbose(env, "Cannot extend fentry/fexit\n");
+ return -EINVAL;
+ }
+ key = ((u64)aux->id) << 32 | btf_id;
+ } else {
+ if (prog_extension) {
+ verbose(env, "Cannot replace kernel functions\n");
+ return -EINVAL;
+ }
+ key = btf_id;
+ }
+
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_RAW_TP:
+ if (tgt_prog) {
+ verbose(env,
+ "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
+ return -EINVAL;
+ }
+ if (!btf_type_is_typedef(t)) {
+ verbose(env, "attach_btf_id %u is not a typedef\n",
+ btf_id);
+ return -EINVAL;
+ }
+ if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
+ verbose(env, "attach_btf_id %u points to wrong type name %s\n",
+ btf_id, tname);
+ return -EINVAL;
+ }
+ tname += sizeof(prefix) - 1;
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+
+ /* remember two read only pointers that are valid for
+ * the life time of the kernel
+ */
+ prog->aux->attach_func_name = tname;
+ prog->aux->attach_func_proto = t;
+ prog->aux->attach_btf_trace = true;
+ return 0;
+ default:
+ if (!prog_extension)
+ return -EINVAL;
+ /* fallthrough */
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ if (!btf_type_is_func(t)) {
+ verbose(env, "attach_btf_id %u is not a function\n",
+ btf_id);
+ return -EINVAL;
+ }
+ if (prog_extension &&
+ btf_check_type_match(env, prog, btf, t))
+ return -EINVAL;
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+ tr = bpf_trampoline_lookup(key);
+ if (!tr)
+ return -ENOMEM;
+ prog->aux->attach_func_name = tname;
+ /* t is either vmlinux type or another program's type */
+ prog->aux->attach_func_proto = t;
+ mutex_lock(&tr->mutex);
+ if (tr->func.addr) {
+ prog->aux->trampoline = tr;
+ goto out;
+ }
+ if (tgt_prog && conservative) {
+ prog->aux->attach_func_proto = NULL;
+ t = NULL;
+ }
+ ret = btf_distill_func_proto(&env->log, btf, t,
+ tname, &tr->func.model);
+ if (ret < 0)
+ goto out;
+ if (tgt_prog) {
+ if (subprog == 0)
+ addr = (long) tgt_prog->bpf_func;
+ else
+ addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ if (!addr) {
+ verbose(env,
+ "The address of function %s cannot be found\n",
+ tname);
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+ tr->func.addr = (void *)addr;
+ prog->aux->trampoline = tr;
+out:
+ mutex_unlock(&tr->mutex);
+ if (ret)
+ bpf_trampoline_put(tr);
+ return ret;
+ }
+}
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
@@ -9233,6 +10014,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->ops = bpf_verifier_ops[env->prog->type];
is_priv = capable(CAP_SYS_ADMIN);
+ if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ mutex_lock(&bpf_verifier_lock);
+ if (!btf_vmlinux)
+ btf_vmlinux = btf_parse_vmlinux();
+ mutex_unlock(&bpf_verifier_lock);
+ }
+
/* grab the mutex to protect few globals used by verifier */
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
@@ -9252,6 +10040,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto err_unlock;
}
+ if (IS_ERR(btf_vmlinux)) {
+ /* Either gcc or pahole or kernel are broken. */
+ verbose(env, "in-kernel BTF is malformed\n");
+ ret = PTR_ERR(btf_vmlinux);
+ goto skip_full_check;
+ }
+
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
@@ -9260,6 +10055,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->allow_ptr_leaks = is_priv;
+ if (is_priv)
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -9285,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check(env);
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
+ ret = do_check_subprogs(env);
+ ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
- while (!pop_stack(env, NULL, NULL));
- free_states(env);
+ kvfree(env->explored_states);
if (ret == 0)
ret = check_max_stack_depth(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9bb96ace9fa1..2cc5c8f4c800 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -9,17 +9,72 @@
#include <linux/slab.h>
#include <linux/sched.h>
-struct xsk_map {
- struct bpf_map map;
- struct xdp_sock **xsk_map;
- struct list_head __percpu *flush_list;
-};
+int xsk_map_inc(struct xsk_map *map)
+{
+ bpf_map_inc(&map->map);
+ return 0;
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+ bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *node;
+ int err;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ err = xsk_map_inc(map);
+ if (err) {
+ kfree(node);
+ return ERR_PTR(err);
+ }
+
+ node->map = map;
+ node->map_entry = map_entry;
+ return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+ xsk_map_put(node->map);
+ kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+ spin_lock_bh(&xs->map_list_lock);
+ list_add_tail(&node->node, &xs->map_list);
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *n, *tmp;
+
+ spin_lock_bh(&xs->map_list_lock);
+ list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+ if (map_entry == n->map_entry) {
+ list_del(&n->node);
+ xsk_map_node_free(n);
+ }
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+}
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
+ struct bpf_map_memory mem;
+ int err, numa_node;
struct xsk_map *m;
- int cpu, err;
- u64 cost;
+ u64 size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -29,66 +84,33 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
return ERR_PTR(-EINVAL);
- m = kzalloc(sizeof(*m), GFP_USER);
- if (!m)
- return ERR_PTR(-ENOMEM);
+ numa_node = bpf_map_attr_numa_node(attr);
+ size = struct_size(m, xsk_map, attr->max_entries);
- bpf_map_init_from_attr(&m->map, attr);
-
- cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
- cost += sizeof(struct list_head) * num_possible_cpus();
-
- /* Notice returns -EPERM on if map size is larger than memlock limit */
- err = bpf_map_charge_init(&m->map.memory, cost);
- if (err)
- goto free_m;
-
- err = -ENOMEM;
+ err = bpf_map_charge_init(&mem, size);
+ if (err < 0)
+ return ERR_PTR(err);
- m->flush_list = alloc_percpu(struct list_head);
- if (!m->flush_list)
- goto free_charge;
+ m = bpf_map_area_alloc(size, numa_node);
+ if (!m) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+ bpf_map_init_from_attr(&m->map, attr);
+ bpf_map_charge_move(&m->map.memory, &mem);
+ spin_lock_init(&m->lock);
- m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
- sizeof(struct xdp_sock *),
- m->map.numa_node);
- if (!m->xsk_map)
- goto free_percpu;
return &m->map;
-
-free_percpu:
- free_percpu(m->flush_list);
-free_charge:
- bpf_map_charge_finish(&m->map.memory);
-free_m:
- kfree(m);
- return ERR_PTR(err);
}
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- int i;
bpf_clear_redirect_map(map);
synchronize_net();
-
- for (i = 0; i < map->max_entries; i++) {
- struct xdp_sock *xs;
-
- xs = m->xsk_map[i];
- if (!xs)
- continue;
-
- sock_put((struct sock *)xs);
- }
-
- free_percpu(m->flush_list);
- bpf_map_area_free(m->xsk_map);
- kfree(m);
+ bpf_map_area_free(m);
}
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -108,45 +130,20 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *xs;
-
- if (key >= map->max_entries)
- return NULL;
-
- xs = READ_ONCE(m->xsk_map[key]);
- return xs;
-}
-
-int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
- struct xdp_sock *xs)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct list_head *flush_list = this_cpu_ptr(m->flush_list);
- int err;
-
- err = xsk_rcv(xs, xdp);
- if (err)
- return err;
-
- if (!xs->flush_node.prev)
- list_add(&xs->flush_node, flush_list);
-
- return 0;
-}
-
-void __xsk_map_flush(struct bpf_map *map)
+static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct list_head *flush_list = this_cpu_ptr(m->flush_list);
- struct xdp_sock *xs, *tmp;
-
- list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
- xsk_flush(xs);
- __list_del_clearprev(&xs->flush_node);
- }
+ const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
+ struct bpf_insn *insn = insn_buf;
+
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
@@ -164,8 +161,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xdp_sock *xs, *old_xs;
+ struct xsk_map_node *node;
struct socket *sock;
int err;
@@ -173,8 +171,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
- return -EEXIST;
sock = sockfd_lookup(fd, &err);
if (!sock)
@@ -192,37 +188,76 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOPNOTSUPP;
}
- sock_hold(sock->sk);
+ map_entry = &m->xsk_map[i];
+ node = xsk_map_node_alloc(m, map_entry);
+ if (IS_ERR(node)) {
+ sockfd_put(sock);
+ return PTR_ERR(node);
+ }
- old_xs = xchg(&m->xsk_map[i], xs);
+ spin_lock_bh(&m->lock);
+ old_xs = READ_ONCE(*map_entry);
+ if (old_xs == xs) {
+ err = 0;
+ goto out;
+ } else if (old_xs && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out;
+ } else if (!old_xs && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out;
+ }
+ xsk_map_sock_add(xs, node);
+ WRITE_ONCE(*map_entry, xs);
if (old_xs)
- sock_put((struct sock *)old_xs);
-
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
+
+out:
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ xsk_map_node_free(node);
+ return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs;
+ struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
- old_xs = xchg(&m->xsk_map[k], NULL);
+ spin_lock_bh(&m->lock);
+ map_entry = &m->xsk_map[k];
+ old_xs = xchg(map_entry, NULL);
if (old_xs)
- sock_put((struct sock *)old_xs);
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
return 0;
}
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ spin_lock_bh(&map->lock);
+ if (READ_ONCE(*map_entry) == xs) {
+ WRITE_ONCE(*map_entry, NULL);
+ xsk_map_sock_delete(xs, map_entry);
+ }
+ spin_unlock_bh(&map->lock);
+}
+
const struct bpf_map_ops xsk_map_ops = {
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
.map_lookup_elem = xsk_map_lookup_elem,
+ .map_gen_lookup = xsk_map_gen_lookup,
.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 809e34a3c017..bfbeabc17a9d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -7,7 +7,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
-#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
@@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task)
+void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
@@ -264,7 +265,7 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
-extern const struct fs_parameter_description cgroup1_fs_parameters;
+extern const struct fs_parameter_spec cgroup1_fs_parameters[];
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 88006be40ea3..be1a1c83cdd1 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -18,8 +18,6 @@
#include <trace/events/cgroup.h>
-#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
-
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -194,25 +192,6 @@ struct cgroup_pidlist {
};
/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(array_size(count, sizeof(pid_t)));
- else
- return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
@@ -244,7 +223,7 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
*/
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
- pidlist_free(l->list);
+ kvfree(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
@@ -365,7 +344,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
+ array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -390,12 +369,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
- pidlist_free(array);
+ kvfree(array);
return -ENOMEM;
}
/* store array, freeing old if necessary */
- pidlist_free(l->list);
+ kvfree(l->list);
l->list = array;
l->length = length;
*lp = l;
@@ -514,12 +493,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
+ bool locked;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup);
+ task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -541,7 +521,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task);
+ cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -905,7 +885,7 @@ enum cgroup1_param {
Opt_xattr,
};
-static const struct fs_parameter_spec cgroup1_param_specs[] = {
+const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("all", Opt_all),
fsparam_flag ("clone_children", Opt_clone_children),
fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
@@ -917,11 +897,6 @@ static const struct fs_parameter_spec cgroup1_param_specs[] = {
{}
};
-const struct fs_parameter_description cgroup1_fs_parameters = {
- .name = "cgroup1",
- .specs = cgroup1_param_specs,
-};
-
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
@@ -929,7 +904,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
int opt, i;
- opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
fc->source = param->string;
@@ -942,7 +917,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->subsys_mask |= (1 << i);
return 0;
}
- return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
+ return invalfc(fc, "Unknown subsys name '%s'", param->key);
}
if (opt < 0)
return opt;
@@ -970,7 +945,7 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
- return cg_invalf(fc, "cgroup1: release_agent respecified");
+ return invalfc(fc, "release_agent respecified");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -980,9 +955,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -ENOENT;
/* Can't specify an empty name */
if (!param->size)
- return cg_invalf(fc, "cgroup1: Empty name");
+ return invalfc(fc, "Empty name");
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
- return cg_invalf(fc, "cgroup1: Name too long");
+ return invalfc(fc, "Name too long");
/* Must match [\w.-]+ */
for (i = 0; i < param->size; i++) {
char c = param->string[i];
@@ -990,11 +965,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
- return cg_invalf(fc, "cgroup1: Invalid name");
+ return invalfc(fc, "Invalid name");
}
/* Specifying two names is forbidden */
if (ctx->name)
- return cg_invalf(fc, "cgroup1: name respecified");
+ return invalfc(fc, "name respecified");
ctx->name = param->string;
param->string = NULL;
break;
@@ -1029,7 +1004,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
if (ctx->all_ss) {
/* Mutually exclusive option 'all' + subsystem name */
if (ctx->subsys_mask)
- return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
+ return invalfc(fc, "subsys name conflicts with all");
/* 'all' => select all the subsystems */
ctx->subsys_mask = enabled;
}
@@ -1039,7 +1014,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
* empty hierarchies must have a name).
*/
if (!ctx->subsys_mask && !ctx->name)
- return cg_invalf(fc, "cgroup1: Need name or subsystem set");
+ return invalfc(fc, "Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
@@ -1047,11 +1022,11 @@ static int check_cgroupfs_options(struct fs_context *fc)
* the cpuset subsystem.
*/
if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
- return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
+ return invalfc(fc, "noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
if (ctx->subsys_mask && ctx->none)
- return cg_invalf(fc, "cgroup1: none used incorrectly");
+ return invalfc(fc, "none used incorrectly");
return 0;
}
@@ -1081,7 +1056,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
/* Don't allow flags or name to change at remount */
if ((ctx->flags ^ root->flags) ||
(ctx->name && strcmp(ctx->name, root->name))) {
- cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
@@ -1198,7 +1173,7 @@ static int cgroup1_root_to_use(struct fs_context *fc)
* can't create new one without subsys specification.
*/
if (!ctx->subsys_mask && !ctx->none)
- return cg_invalf(fc, "cgroup1: No subsys list or none specified");
+ return invalfc(fc, "No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
if (ctx->ns != &init_cgroup_ns)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 753afbca549f..75f687301bbf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -488,7 +488,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
rcu_read_lock();
css = cgroup_css(cgrp, ss);
- if (!css || !css_tryget_online(css))
+ if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
@@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -1309,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- if (root) {
- idr_destroy(&root->cgroup_idr);
- kfree(root);
- }
+ kfree(root);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1374,6 +1370,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1430,9 +1428,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
{
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * No need to lock the task - since we hold css_set_lock the
+ * task can't change groups.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
@@ -1819,24 +1816,19 @@ enum cgroup2_param {
nr__cgroup2_params
};
-static const struct fs_parameter_spec cgroup2_param_specs[] = {
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
fsparam_flag("memory_localevents", Opt_memory_localevents),
{}
};
-static const struct fs_parameter_description cgroup2_fs_parameters = {
- .name = "cgroup2",
- .specs = cgroup2_param_specs,
-};
-
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -1883,65 +1875,6 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0;
}
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first mount.
- */
-static bool use_task_css_set_links __read_mostly;
-
-static void cgroup_enable_task_cg_lists(void)
-{
- struct task_struct *p, *g;
-
- /*
- * We need tasklist_lock because RCU is not safe against
- * while_each_thread(). Besides, a forking task that has passed
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
- * is not guaranteed to have its child immediately visible in the
- * tasklist if we walk through it with RCU.
- */
- read_lock(&tasklist_lock);
- spin_lock_irq(&css_set_lock);
-
- if (use_task_css_set_links)
- goto out_unlock;
-
- use_task_css_set_links = true;
-
- do_each_thread(g, p) {
- WARN_ON_ONCE(!list_empty(&p->cg_list) ||
- task_css_set(p) != &init_css_set);
-
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- * Do it while holding siglock so that we don't end up
- * racing against cgroup_exit().
- *
- * Interrupts were already disabled while acquiring
- * the css_set_lock, so we do not need to disable it
- * again when acquiring the sighand->siglock here.
- */
- spin_lock(&p->sighand->siglock);
- if (!(p->flags & PF_EXITING)) {
- struct css_set *cset = task_css_set(p);
-
- if (!css_set_populated(cset))
- css_set_update_populated(cset, true);
- list_add_tail(&p->cg_list, &cset->tasks);
- get_css_set(cset);
- cset->nr_tasks++;
- }
- spin_unlock(&p->sighand->siglock);
- } while_each_thread(g, p);
-out_unlock:
- spin_unlock_irq(&css_set_lock);
- read_unlock(&tasklist_lock);
-}
-
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -1976,7 +1909,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- idr_init(&root->cgroup_idr);
root->flags = ctx->flags;
if (ctx->release_agent)
@@ -1997,12 +1929,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
lockdep_assert_held(&cgroup_mutex);
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
- if (ret < 0)
- goto out;
- root_cgrp->id = ret;
- root_cgrp->ancestor_ids[0] = ret;
-
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL);
if (ret)
@@ -2035,6 +1961,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
+ root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2119,11 +2047,12 @@ int cgroup_do_get_tree(struct fs_context *fc)
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
- fc->root = nsdentry;
if (IS_ERR(nsdentry)) {
- ret = PTR_ERR(nsdentry);
deactivate_locked_super(sb);
+ ret = PTR_ERR(nsdentry);
+ nsdentry = NULL;
}
+ fc->root = nsdentry;
}
if (!ctx->kfc.new_sb_created)
@@ -2187,13 +2116,6 @@ static int cgroup_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
-
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
fc->fs_private = &ctx->kfc;
@@ -2229,7 +2151,7 @@ static void cgroup_kill_sb(struct super_block *sb)
struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.init_fs_context = cgroup_init_fs_context,
- .parameters = &cgroup1_fs_parameters,
+ .parameters = cgroup1_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -2237,7 +2159,7 @@ struct file_system_type cgroup_fs_type = {
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.init_fs_context = cgroup_init_fs_context,
- .parameters = &cgroup2_fs_parameters,
+ .parameters = cgroup2_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -2371,9 +2293,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (task->flags & PF_EXITING)
return;
- /* leave @task alone if post_fork() hasn't linked it yet */
- if (list_empty(&task->cg_list))
- return;
+ /* cgroup_threadgroup_rwsem protects racing against forks */
+ WARN_ON_ONCE(list_empty(&task->cg_list));
cset = task_css_set(task);
if (!cset->mg_src_cgrp)
@@ -2824,7 +2745,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ bool *locked)
__acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
@@ -2833,7 +2755,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
+ * callers by cgroup_mutex.
+ * Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+ if (pid || threadgroup) {
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ *locked = true;
+ } else {
+ *locked = false;
+ }
rcu_read_lock();
if (pid) {
@@ -2864,13 +2800,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
goto out_unlock_rcu;
out_unlock_threadgroup:
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ if (*locked) {
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ *locked = false;
+ }
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task)
+void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem)
{
struct cgroup_subsys *ss;
@@ -2879,7 +2818,8 @@ void cgroup_procs_write_finish(struct task_struct *task)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ if (locked)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2894,7 +2834,7 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
+ seq_puts(seq, ss->name);
printed = true;
} while_each_subsys_mask();
if (printed)
@@ -3110,8 +3050,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
@@ -3121,6 +3059,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
return PTR_ERR(css);
}
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
@@ -3156,11 +3096,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!css)
continue;
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
@@ -3447,7 +3387,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn, false);
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
@@ -3600,22 +3541,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
@@ -4567,9 +4508,6 @@ repeat:
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
- /* no one should try to iterate before mounting cgroups */
- WARN_ON_ONCE(!use_task_css_set_links);
-
memset(it, 0, sizeof(*it));
spin_lock_irq(&css_set_lock);
@@ -4754,12 +4692,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
ssize_t ret;
+ bool locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true);
+ task = cgroup_procs_write_start(buf, true, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4777,7 +4716,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(dst_cgrp, task, true);
out_finish:
- cgroup_procs_write_finish(task);
+ cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -4795,6 +4734,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
ssize_t ret;
+ bool locked;
buf = strstrip(buf);
@@ -4802,7 +4742,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, false);
+ task = cgroup_procs_write_start(buf, false, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4826,7 +4766,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(dst_cgrp, task, false);
out_finish:
- cgroup_procs_write_finish(task);
+ cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5036,9 +4976,6 @@ static void css_release_work_fn(struct work_struct *work)
tcgrp->nr_dying_descendants--;
spin_unlock_irq(&css_set_lock);
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- cgrp->id = -1;
-
/*
* There are two control paths which try to determine
* cgroup from dentry without going through kernfs -
@@ -5203,10 +5140,12 @@ err_free_css:
* it isn't associated with its kernfs_node and doesn't have the control
* mask applied.
*/
-static struct cgroup *cgroup_create(struct cgroup *parent)
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
+ umode_t mode)
{
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
+ struct kernfs_node *kn;
int level = parent->level + 1;
int ret;
@@ -5226,15 +5165,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
goto out_cancel_ref;
}
- /*
- * Temporarily set the pointer to NULL, so idr_find() won't return
- * a half-baked cgroup.
- */
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
- if (cgrp->id < 0) {
- ret = -ENOMEM;
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
goto out_stat_exit;
}
+ cgrp->kn = kn;
init_cgroup_housekeeping(cgrp);
@@ -5244,7 +5181,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
ret = psi_cgroup_alloc(cgrp);
if (ret)
- goto out_idr_free;
+ goto out_kernfs_remove;
ret = cgroup_bpf_inherit(cgrp);
if (ret)
@@ -5255,12 +5192,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
- if (cgrp->freezer.e_freeze)
+ if (cgrp->freezer.e_freeze) {
+ /*
+ * Set the CGRP_FREEZE flag, so when a process will be
+ * attached to the child cgroup, it will become frozen.
+ * At this point the new cgroup is unpopulated, so we can
+ * consider it frozen immediately.
+ */
+ set_bit(CGRP_FREEZE, &cgrp->flags);
set_bit(CGRP_FROZEN, &cgrp->flags);
+ }
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5290,12 +5235,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgroup_get_live(parent);
/*
- * @cgrp is now fully operational. If something fails after this
- * point, it'll be released via the normal destruction path.
- */
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
-
- /*
* On the default hierarchy, a child doesn't automatically inherit
* subtree_control from the parent. Each is configured manually.
*/
@@ -5308,8 +5247,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
out_psi_free:
psi_cgroup_free(cgrp);
-out_idr_free:
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_kernfs_remove:
+ kernfs_remove(cgrp->kn);
out_stat_exit:
if (cgroup_on_dfl(parent))
cgroup_rstat_exit(cgrp);
@@ -5346,7 +5285,6 @@ fail:
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
- struct kernfs_node *kn;
int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
@@ -5362,27 +5300,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
goto out_unlock;
}
- cgrp = cgroup_create(parent);
+ cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
goto out_unlock;
}
- /* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- goto out_destroy;
- }
- cgrp->kn = kn;
-
/*
* This extra ref will be put in cgroup_free_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
- kernfs_get(kn);
+ kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(kn);
+ ret = cgroup_kn_set_ugid(cgrp->kn);
if (ret)
goto out_destroy;
@@ -5397,7 +5327,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
TRACE_CGROUP_PATH(mkdir, cgrp);
/* let's create and online css's */
- kernfs_activate(kn);
+ kernfs_activate(cgrp->kn);
ret = 0;
goto out_unlock;
@@ -5827,12 +5757,11 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
- char *buf, size_t buflen)
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
struct kernfs_node *kn;
- kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
return;
kernfs_path(kn, buf, buflen);
@@ -5993,62 +5922,41 @@ void cgroup_cancel_fork(struct task_struct *child)
void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
+ struct css_set *cset;
int i;
+ spin_lock_irq(&css_set_lock);
+
+ /* init tasks are special, only link regular threads */
+ if (likely(child->pid)) {
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
+ cset = task_css_set(current); /* current is @child's parent */
+ get_css_set(cset);
+ cset->nr_tasks++;
+ css_set_move_task(child, NULL, cset, false);
+ }
+
/*
- * This may race against cgroup_enable_task_cg_lists(). As that
- * function sets use_task_css_set_links before grabbing
- * tasklist_lock and we just went through tasklist_lock to add
- * @child, it's guaranteed that either we see the set
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
- * @child during its iteration.
- *
- * If we won the race, @child is associated with %current's
- * css_set. Grabbing css_set_lock guarantees both that the
- * association is stable, and, on completion of the parent's
- * migration, @child is visible in the source of migration or
- * already in the destination cgroup. This guarantee is necessary
- * when implementing operations which need to migrate all tasks of
- * a cgroup to another.
- *
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
- * will remain in init_css_set. This is safe because all tasks are
- * in the init_css_set before cg_links is enabled and there's no
- * operation which transfers all tasks out of init_css_set.
+ * If the cgroup has to be frozen, the new task has too. Let's set
+ * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
+ * frozen state.
*/
- if (use_task_css_set_links) {
- struct css_set *cset;
-
- spin_lock_irq(&css_set_lock);
- cset = task_css_set(current);
- if (list_empty(&child->cg_list)) {
- get_css_set(cset);
- cset->nr_tasks++;
- css_set_move_task(child, NULL, cset, false);
- }
+ if (unlikely(cgroup_task_freeze(child))) {
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
/*
- * If the cgroup has to be frozen, the new task has too.
- * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
- * the task into the frozen state.
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient switch
+ * from the frozen state and back.
*/
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
-
- /*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later
- * from do_freezer_trap(). So we avoid cgroup's
- * transient switch from the frozen state and back.
- */
- }
-
- spin_unlock_irq(&css_set_lock);
}
+ spin_unlock_irq(&css_set_lock);
+
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
@@ -6063,20 +5971,8 @@ void cgroup_post_fork(struct task_struct *child)
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
- * Description: Detach cgroup from @tsk and release it.
+ * Description: Detach cgroup from @tsk.
*
- * Note that cgroups marked notify_on_release force every task in
- * them to take the global cgroup_mutex mutex when exiting.
- * This could impact scaling on very large systems. Be reluctant to
- * use notify_on_release cgroups where very high task exit scaling
- * is required on large systems.
- *
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
- * call cgroup_exit() while the task is still competent to handle
- * notify_on_release(), then leave the task attached to the root cgroup in
- * each hierarchy for the remainder of its exit. No need to bother with
- * init_css_set refcnting. init_css_set never goes away and we can't race
- * with migration path - PF_EXITING is visible to migration path.
*/
void cgroup_exit(struct task_struct *tsk)
{
@@ -6084,26 +5980,19 @@ void cgroup_exit(struct task_struct *tsk)
struct css_set *cset;
int i;
- /*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
- */
- cset = task_css_set(tsk);
+ spin_lock_irq(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
- css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
- cset->nr_tasks--;
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
+ cset = task_css_set(tsk);
+ css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ cset->nr_tasks--;
- WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
- cgroup_update_frozen(task_dfl_cgroup(tsk));
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
+ if (unlikely(cgroup_task_freeze(tsk)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
- } else {
- get_css_set(cset);
- }
+ spin_unlock_irq(&css_set_lock);
/* see cgroup_post_fork() for details */
do_each_subsys_mask(ss, i, have_exit_callback) {
@@ -6120,12 +6009,10 @@ void cgroup_release(struct task_struct *task)
ss->release(task);
} while_each_subsys_mask();
- if (use_task_css_set_links) {
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
- }
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
}
void cgroup_free(struct task_struct *task)
@@ -6400,12 +6287,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+ struct bpf_prog *replace_prog, enum bpf_attach_type type,
+ u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5aa37531ce76..58f5073acff7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
@@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-static DEFINE_MUTEX(cpuset_mutex);
+DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+
+void cpuset_read_lock(void)
+{
+ percpu_down_read(&cpuset_rwsem);
+}
+
+void cpuset_read_unlock(void)
+{
+ percpu_up_read(&cpuset_rwsem);
+}
+
static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -786,7 +798,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
- if (is_sched_load_balance(cp))
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
@@ -894,6 +907,65 @@ done:
return ndoms;
}
+static void update_tasks_root_domain(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+
+ while ((task = css_task_iter_next(&it)))
+ dl_add_task_root_domain(task);
+
+ css_task_iter_end(&it);
+}
+
+static void rebuild_root_domains(void)
+{
+ struct cpuset *cs = NULL;
+ struct cgroup_subsys_state *pos_css;
+
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&sched_domains_mutex);
+
+ rcu_read_lock();
+
+ /*
+ * Clear default root domain DL accounting, it will be computed again
+ * if a task belongs to it.
+ */
+ dl_clear_root_domain(&def_root_domain);
+
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+
+ if (cpumask_empty(cs->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ css_get(&cs->css);
+
+ rcu_read_unlock();
+
+ update_tasks_root_domain(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+}
+
+static void
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ rebuild_root_domains();
+ mutex_unlock(&sched_domains_mutex);
+}
+
/*
* Rebuild scheduler domains.
*
@@ -911,8 +983,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;
- lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
+ lockdep_assert_cpus_held();
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -921,19 +993,17 @@ static void rebuild_sched_domains_locked(void)
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
if (top_cpuset.nr_subparts_cpus &&
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
+ partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -943,9 +1013,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
/**
@@ -1051,7 +1123,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
bool part_error = false; /* Partition error? */
- lockdep_assert_held(&cpuset_mutex);
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* The parent must be a partition root.
@@ -2039,7 +2111,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
@@ -2063,7 +2135,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cs->attach_in_progress++;
ret = 0;
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
return ret;
}
@@ -2073,9 +2145,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
css_cs(css)->attach_in_progress--;
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2098,7 +2170,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* prepare for attach */
if (cs == &top_cpuset)
@@ -2152,7 +2224,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/* The various types of files and directories in a cpuset file system */
@@ -2183,7 +2255,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2219,7 +2292,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2230,7 +2304,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2243,7 +2318,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2282,7 +2358,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2306,7 +2383,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2437,13 +2515,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
css_get(&cs->css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2649,7 +2729,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -2700,7 +2781,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return 0;
}
@@ -2719,7 +2801,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (is_partition_root(cs))
update_prstate(cs, 0);
@@ -2738,7 +2821,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2750,7 +2834,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
@@ -2763,7 +2847,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2805,6 +2889,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
+ BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
+
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2876,7 +2962,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/*
* Move tasks to the nearest ancestor with execution resources,
@@ -2886,7 +2972,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
}
static void
@@ -2936,14 +3022,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
goto retry;
}
@@ -3011,7 +3097,7 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/**
@@ -3041,7 +3127,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3091,7 +3177,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 8cf010680678..3984dd6b8ddb 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -231,6 +231,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
return;
/*
+ * It's not necessary to do changes if both of the src and dst cgroups
+ * are not freezing and task is not frozen.
+ */
+ if (!test_bit(CGRP_FREEZE, &src->flags) &&
+ !test_bit(CGRP_FREEZE, &dst->flags) &&
+ !task->frozen)
+ return;
+
+ /*
* Adjust counters of freezing and frozen tasks.
* Note, that if the task is frozen, but the destination cgroup is not
* frozen, we bump both counters to keep them balanced.
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 8e513a573fe9..138059eb730d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -45,7 +45,7 @@ struct pids_cgroup {
* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
*/
atomic64_t counter;
- int64_t limit;
+ atomic64_t limit;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->limit, PIDS_MAX);
atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
for (p = pids; parent_pids(p); p = parent_pids(p)) {
int64_t new = atomic64_add_return(num, &p->counter);
+ int64_t limit = atomic64_read(&p->limit);
/*
* Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > p->limit)
+ if (new > limit)
goto revert;
}
@@ -277,7 +278,7 @@ set_limit:
* Limit updates don't need to be mutex'd, since it isn't
* critical that any racing fork()s follow the new limit.
*/
- pids->limit = limit;
+ atomic64_set(&pids->limit, limit);
return nbytes;
}
@@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v)
{
struct cgroup_subsys_state *css = seq_css(sf);
struct pids_cgroup *pids = css_pids(css);
- int64_t limit = pids->limit;
+ int64_t limit = atomic64_read(&pids->limit);
if (limit >= PIDS_MAX)
seq_printf(sf, "%s\n", PIDS_MAX_STR);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ca19b4c8acf5..6f87352f8219 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,7 +33,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
return;
/*
- * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
+ * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
* see NULL updated_next or they see our updated stat.
*/
smp_mb();
@@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void)
* Functions for cgroup basic resource statistics implemented on top of
* rstat.
*/
-static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
- struct cgroup_base_stat *src_bstat)
+static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
{
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}
+static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
+{
+ dst_bstat->cputime.utime -= src_bstat->cputime.utime;
+ dst_bstat->cputime.stime -= src_bstat->cputime.stime;
+ dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+}
+
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
- struct task_cputime cputime;
- struct cgroup_base_stat delta;
+ struct cgroup_base_stat cur, delta;
unsigned seq;
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cputime = rstatc->bstat.cputime;
+ cur.cputime = rstatc->bstat.cputime;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
- /* calculate the delta to propgate */
- delta.cputime.utime = cputime.utime - last_cputime->utime;
- delta.cputime.stime = cputime.stime - last_cputime->stime;
- delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
- last_cputime->sum_exec_runtime;
- *last_cputime = cputime;
-
- /* transfer the pending stat into delta */
- cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
- memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
-
- /* propagate delta into the global stat and the parent's pending */
- cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
- if (parent)
- cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
+ /* propagate percpu delta to global */
+ delta = cur;
+ cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+ cgroup_base_stat_add(&cgrp->bstat, &delta);
+ cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+
+ /* propagate global delta to parent */
+ if (parent) {
+ delta = cgrp->bstat;
+ cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
+ cgroup_base_stat_add(&parent->bstat, &delta);
+ cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+ }
}
static struct cgroup_rstat_cpu *
diff --git a/kernel/compat.c b/kernel/compat.c
index a2bc1d6ceb57..95005f849c68 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
-int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
-{
- struct compat_itimerval v32;
-
- if (copy_from_user(&v32, i, sizeof(struct compat_itimerval)))
- return -EFAULT;
- o->it_interval.tv_sec = v32.it_interval.tv_sec;
- o->it_interval.tv_usec = v32.it_interval.tv_usec;
- o->it_value.tv_sec = v32.it_value.tv_sec;
- o->it_value.tv_usec = v32.it_value.tv_usec;
- return 0;
-}
-
-int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i)
-{
- struct compat_itimerval v32;
-
- v32.it_interval.tv_sec = i->it_interval.tv_sec;
- v32.it_interval.tv_usec = i->it_interval.tv_usec;
- v32.it_value.tv_sec = i->it_value.tv_sec;
- v32.it_value.tv_usec = i->it_value.tv_usec;
- return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
-}
-
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*
diff --git a/kernel/configs.c b/kernel/configs.c
index c09ea4c995e1..a28c79c5f713 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -47,10 +47,9 @@ ikconfig_read_current(struct file *file, char __user *buf,
&kernel_config_data);
}
-static const struct file_operations ikconfig_file_ops = {
- .owner = THIS_MODULE,
- .read = ikconfig_read_current,
- .llseek = default_llseek,
+static const struct proc_ops config_gz_proc_ops = {
+ .proc_read = ikconfig_read_current,
+ .proc_lseek = default_llseek,
};
static int __init ikconfig_init(void)
@@ -59,7 +58,7 @@ static int __init ikconfig_init(void)
/* create the current config file */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
- &ikconfig_file_ops);
+ &config_gz_proc_ops);
if (!entry)
return -ENOMEM;
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index be01a4d627c9..0296b4bda8f1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -25,8 +25,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
-EXPORT_SYMBOL_GPL(context_tracking_enabled);
+DEFINE_STATIC_KEY_FALSE(context_tracking_key);
+EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
@@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
- static_branch_inc(&context_tracking_enabled);
+ static_branch_inc(&context_tracking_key);
}
if (initialized)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e84c0873559e..9c706af713fb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -62,7 +62,6 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- bool booted_once;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
.fail = CPUHP_INVALID,
};
+#ifdef CONFIG_SMP
+cpumask_t cpus_booted_once_mask;
+#endif
+
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
static struct lockdep_map cpuhp_state_up_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
@@ -333,7 +336,7 @@ static void lockdep_acquire_cpus_lock(void)
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
}
/*
@@ -389,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
void __init cpu_smt_disable(bool force)
{
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ if (!cpu_smt_possible())
return;
if (force) {
@@ -433,8 +435,16 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
* CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
* core will shutdown the machine.
*/
- return !per_cpu(cpuhp_state, cpu).booted_once;
+ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
+}
+
+/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+bool cpu_smt_possible(void)
+{
+ return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
+ cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
+EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
@@ -515,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
- /* Unpark the stopper thread and the hotplug thread of the target cpu */
- stop_machine_unpark(cpu);
+ /* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
@@ -1066,7 +1075,7 @@ void notify_cpu_starting(unsigned int cpu)
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
- st->booted_once = true;
+ cpumask_set_cpu(cpu, &cpus_booted_once_mask);
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@@ -1079,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu)
/*
* Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1090,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ /*
+ * Unpart the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
@@ -1899,6 +1914,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
EXPORT_SYMBOL(__cpuhp_remove_state);
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -2053,77 +2140,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = true;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = false;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- for_each_online_cpu(cpu) {
- if (topology_is_primary_thread(cpu))
- continue;
- ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
- if (ret)
- break;
- /*
- * As this needs to hold the cpu maps lock it's impossible
- * to call device_offline() because that ends up calling
- * cpu_down() which takes cpu maps lock. cpu maps lock
- * needs to be held as this might race against in kernel
- * abusers of the hotplug machinery (thermal management).
- *
- * So nothing would update device:offline state. That would
- * leave the sysfs entry stale and prevent onlining after
- * smt control has been changed to 'off' again. This is
- * called under the sysfs hotplug lock, so it is properly
- * serialized against the regular offline usage.
- */
- cpuhp_offline_cpu_device(cpu);
- }
- if (!ret)
- cpu_smt_control = ctrlval;
- cpu_maps_update_done();
- return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- cpu_smt_control = CPU_SMT_ENABLED;
- for_each_present_cpu(cpu) {
- /* Skip online CPUs and CPUs on offline nodes */
- if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
- continue;
- ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
- if (ret)
- break;
- /* See comment in cpuhp_smt_disable() */
- cpuhp_online_cpu_device(cpu);
- }
- cpu_maps_update_done();
- return ret;
-}
-
-
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
@@ -2295,6 +2311,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
+atomic_t __num_online_cpus __read_mostly;
+EXPORT_SYMBOL(__num_online_cpus);
+
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(&__cpu_present_mask, src);
@@ -2310,6 +2329,27 @@ void init_cpu_online(const struct cpumask *src)
cpumask_copy(&__cpu_online_mask, src);
}
+void set_cpu_online(unsigned int cpu, bool online)
+{
+ /*
+ * atomic_inc/dec() is required to handle the horrid abuse of this
+ * function by the reboot and kexec code which invoke it from
+ * IPI/NMI broadcasts when shutting down CPUs. Invocation from
+ * regular CPU hotplug is properly serialized.
+ *
+ * Note, that the fact that __num_online_cpus is of type atomic_t
+ * does not protect readers which are not serialized against
+ * concurrent hotplug operations.
+ */
+ if (online) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
+ atomic_inc(&__num_online_cpus);
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
+ atomic_dec(&__num_online_cpus);
+ }
+}
+
/*
* Activate the first processor.
*/
@@ -2334,12 +2374,23 @@ void __init boot_cpu_init(void)
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
- this_cpu_write(cpuhp_state.booted_once, true);
+ cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+/*
+ * These are used for a global "mitigations=" cmdline option for toggling
+ * optional CPU mitigations.
+ */
+enum cpu_mitigations {
+ CPU_MITIGATIONS_OFF,
+ CPU_MITIGATIONS_AUTO,
+ CPU_MITIGATIONS_AUTO_NOSMT,
+};
+
+static enum cpu_mitigations cpu_mitigations __ro_after_init =
+ CPU_MITIGATIONS_AUTO;
static int __init mitigations_parse_cmdline(char *arg)
{
@@ -2356,3 +2407,17 @@ static int __init mitigations_parse_cmdline(char *arg)
return 0;
}
early_param("mitigations", mitigations_parse_cmdline);
+
+/* mitigations=off */
+bool cpu_mitigations_off(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_OFF;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_off);
+
+/* mitigations=auto,nosmt */
+bool cpu_mitigations_auto_nosmt(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
diff --git a/kernel/cred.c b/kernel/cred.c
index c0a4c12d38b2..809a985b1793 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk)
put_cred(cred);
#ifdef CONFIG_KEYS_REQUEST_CACHE
- key_put(current->cached_requested_key);
- current->cached_requested_key = NULL;
+ key_put(tsk->cached_requested_key);
+ tsk->cached_requested_key = NULL;
#endif
}
@@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void)
new->magic = CRED_MAGIC;
#endif
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -282,7 +282,7 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
@@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..2b7c9b67931d 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -441,6 +441,37 @@ setundefined:
return 0;
}
+#ifdef CONFIG_KGDB_KDB
+void kdb_dump_stack_on_cpu(int cpu)
+{
+ if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) {
+ dump_stack();
+ return;
+ }
+
+ if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) {
+ kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n",
+ cpu);
+ return;
+ }
+
+ /*
+ * In general, architectures don't support dumping the stack of a
+ * "running" process that's not the current one. From the point of
+ * view of the Linux, kernel processes that are looping in the kgdb
+ * slave loop are still "running". There's also no API (that actually
+ * works across all architectures) that can do a stack crawl based
+ * on registers passed as a parameter.
+ *
+ * Solve this conundrum by asking slave CPUs to do the backtrace
+ * themselves.
+ */
+ kgdb_info[cpu].exception_state |= DCPU_WANT_BT;
+ while (kgdb_info[cpu].exception_state & DCPU_WANT_BT)
+ cpu_relax();
+}
+#endif
+
/*
* Return true if there is a valid kgdb I/O module. Also if no
* debugger is attached a message can be printed to the console about
@@ -580,6 +611,9 @@ cpu_loop:
atomic_xchg(&kgdb_active, cpu);
break;
}
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) {
+ dump_stack();
+ kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT;
} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
if (!raw_spin_is_locked(&dbg_slave_lock))
goto return_normal;
@@ -787,11 +821,8 @@ out:
}
/*
- * GDB places a breakpoint at this function to know dynamically
- * loaded objects. It's not defined static so that only one instance with this
- * name exists in the kernel.
+ * GDB places a breakpoint at this function to know dynamically loaded objects.
*/
-
static int module_event(struct notifier_block *self, unsigned long val,
void *data)
{
@@ -896,30 +927,25 @@ static struct sysrq_key_op sysrq_dbg_op = {
};
#endif
-static int kgdb_panic_event(struct notifier_block *self,
- unsigned long val,
- void *data)
+void kgdb_panic(const char *msg)
{
+ if (!kgdb_io_module_registered)
+ return;
+
/*
- * Avoid entering the debugger if we were triggered due to a panic
- * We don't want to get stuck waiting for input from user in such case.
- * panic_timeout indicates the system should automatically
+ * We don't want to get stuck waiting for input from user if
+ * "panic_timeout" indicates the system should automatically
* reboot on panic.
*/
if (panic_timeout)
- return NOTIFY_DONE;
+ return;
if (dbg_kdb_mode)
- kdb_printf("PANIC: %s\n", (char *)data);
+ kdb_printf("PANIC: %s\n", msg);
+
kgdb_breakpoint();
- return NOTIFY_DONE;
}
-static struct notifier_block kgdb_panic_event_nb = {
- .notifier_call = kgdb_panic_event,
- .priority = INT_MAX,
-};
-
void __weak kgdb_arch_late(void)
{
}
@@ -968,8 +994,6 @@ static void kgdb_register_callbacks(void)
kgdb_arch_late();
register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
- atomic_notifier_chain_register(&panic_notifier_list,
- &kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
register_sysrq_key('g', &sysrq_dbg_op);
#endif
@@ -983,16 +1007,14 @@ static void kgdb_register_callbacks(void)
static void kgdb_unregister_callbacks(void)
{
/*
- * When this routine is called KGDB should unregister from the
- * panic handler and clean up, making sure it is not handling any
+ * When this routine is called KGDB should unregister from
+ * handlers and clean up, making sure it is not handling any
* break exceptions at the time.
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
unregister_module_notifier(&dbg_module_load_nb);
- atomic_notifier_chain_unregister(&panic_notifier_list,
- &kgdb_panic_event_nb);
kgdb_arch_exit();
#ifdef CONFIG_MAGIC_SYSRQ
unregister_sysrq_key('g', &sysrq_dbg_op);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index b4a7c326d546..cd22b5f68831 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -33,7 +33,7 @@ struct kgdb_state {
#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
-#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */
struct debuggerinfo_struct {
void *debuggerinfo;
@@ -76,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
+extern void kdb_dump_stack_on_cpu(int cpu);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 62c301ad0773..d7ebb2c79cb8 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -412,7 +412,6 @@ static int kdb_bc(int argc, const char **argv)
* assume that the breakpoint number is desired.
*/
if (addr < KDB_MAXBPT) {
- bp = &kdb_breakpoints[addr];
lowbp = highbp = addr;
highbp++;
} else {
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7e2379aa0a1e..3de0cc780c16 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -22,20 +22,15 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
+
console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
- kdb_set_current_task(p);
- if (addr) {
- show_stack((struct task_struct *)p, addr);
- } else if (kdb_current_regs) {
-#ifdef CONFIG_X86
- show_stack(p, &kdb_current_regs->sp);
-#else
- show_stack(p, NULL);
-#endif
- } else {
- show_stack(p, NULL);
- }
+
+ if (!addr && kdb_task_has_cpu(p))
+ kdb_dump_stack_on_cpu(kdb_process_cpu(p));
+ else
+ show_stack(p, addr);
+
console_loglevel = old_lvl;
kdb_trap_printk--;
}
@@ -78,12 +73,12 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask,
- int argcount, int btaprompt)
+kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
{
- char buffer[2];
- if (kdb_getarea(buffer[0], (unsigned long)p) ||
- kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ char ch;
+
+ if (kdb_getarea(ch, (unsigned long)p) ||
+ kdb_getarea(ch, (unsigned long)(p+1)-1))
return KDB_BADADDR;
if (!kdb_task_state(p, mask))
return 0;
@@ -91,22 +86,46 @@ kdb_bt1(struct task_struct *p, unsigned long mask,
kdb_ps1(p);
kdb_show_stack(p, NULL);
if (btaprompt) {
- kdb_getstr(buffer, sizeof(buffer),
- "Enter <q> to end, <cr> to continue:");
- if (buffer[0] == 'q') {
- kdb_printf("\n");
+ kdb_printf("Enter <q> to end, <cr> or <space> to continue:");
+ do {
+ ch = kdb_getchar();
+ } while (!strchr("\r\n q", ch));
+ kdb_printf("\n");
+
+ /* reset the pager */
+ kdb_nextline = 1;
+
+ if (ch == 'q')
return 1;
- }
}
touch_nmi_watchdog();
return 0;
}
+static void
+kdb_bt_cpu(unsigned long cpu)
+{
+ struct task_struct *kdb_tsk;
+
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("WARNING: no process for cpu %ld\n", cpu);
+ return;
+ }
+
+ /* If a CPU failed to round up we could be here */
+ kdb_tsk = KDB_TSK(cpu);
+ if (!kdb_tsk) {
+ kdb_printf("WARNING: no task for cpu %ld\n", cpu);
+ return;
+ }
+
+ kdb_bt1(kdb_tsk, ~0UL, false);
+}
+
int
kdb_bt(int argc, const char **argv)
{
int diag;
- int argcount = 5;
int btaprompt = 1;
int nextarg;
unsigned long addr;
@@ -125,7 +144,7 @@ kdb_bt(int argc, const char **argv)
/* Run the active tasks first */
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
}
/* Now the inactive tasks */
@@ -134,7 +153,7 @@ kdb_bt(int argc, const char **argv)
return 0;
if (task_curr(p))
continue;
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
} kdb_while_each_thread(g, p);
} else if (strcmp(argv[0], "btp") == 0) {
@@ -146,10 +165,8 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
- if (p) {
- kdb_set_current_task(p);
- return kdb_bt1(p, ~0UL, argcount, 0);
- }
+ if (p)
+ return kdb_bt1(p, ~0UL, false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -158,12 +175,9 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- kdb_set_current_task((struct task_struct *)addr);
- return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
- struct task_struct *save_current_task = kdb_current_task;
- char buf[80];
if (argc > 1)
return KDB_ARGCOUNT;
if (argc == 1) {
@@ -171,35 +185,21 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
}
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
- argv = NULL;
if (cpu != ~0) {
- if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
- kdb_printf("no process for cpu %ld\n", cpu);
- return 0;
- }
- sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
- kdb_parse(buf);
- return 0;
- }
- kdb_printf("btc: cpu status: ");
- kdb_parse("cpu\n");
- for_each_online_cpu(cpu) {
- void *kdb_tsk = KDB_TSK(cpu);
-
- /* If a CPU failed to round up we could be here */
- if (!kdb_tsk) {
- kdb_printf("WARNING: no task for cpu %ld\n",
- cpu);
- continue;
+ kdb_bt_cpu(cpu);
+ } else {
+ /*
+ * Recursive use of kdb_parse, do not use argv after
+ * this point.
+ */
+ argv = NULL;
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ kdb_bt_cpu(cpu);
+ touch_nmi_watchdog();
}
-
- sprintf(buf, "btt 0x%px\n", kdb_tsk);
- kdb_parse(buf);
- touch_nmi_watchdog();
}
- kdb_set_current_task(save_current_task);
return 0;
} else {
if (argc) {
@@ -211,7 +211,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ return kdb_bt1(kdb_current_task, ~0UL, false);
}
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 3a5184eb6977..924bc9298a42 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -49,14 +49,88 @@ static int kgdb_transition_check(char *buffer)
return 0;
}
-static int kdb_read_get_key(char *buffer, size_t bufsize)
+/**
+ * kdb_handle_escape() - validity check on an accumulated escape sequence.
+ * @buf: Accumulated escape characters to be examined. Note that buf
+ * is not a string, it is an array of characters and need not be
+ * nil terminated.
+ * @sz: Number of accumulated escape characters.
+ *
+ * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete,
+ * otherwise it returns a mapped key value to pass to the upper layers.
+ */
+static int kdb_handle_escape(char *buf, size_t sz)
+{
+ char *lastkey = buf + sz - 1;
+
+ switch (sz) {
+ case 1:
+ if (*lastkey == '\e')
+ return 0;
+ break;
+
+ case 2: /* \e<something> */
+ if (*lastkey == '[')
+ return 0;
+ break;
+
+ case 3:
+ switch (*lastkey) {
+ case 'A': /* \e[A, up arrow */
+ return 16;
+ case 'B': /* \e[B, down arrow */
+ return 14;
+ case 'C': /* \e[C, right arrow */
+ return 6;
+ case 'D': /* \e[D, left arrow */
+ return 2;
+ case '1': /* \e[<1,3,4>], may be home, del, end */
+ case '3':
+ case '4':
+ return 0;
+ }
+ break;
+
+ case 4:
+ if (*lastkey == '~') {
+ switch (buf[2]) {
+ case '1': /* \e[1~, home */
+ return 1;
+ case '3': /* \e[3~, del */
+ return 4;
+ case '4': /* \e[4~, end */
+ return 5;
+ }
+ }
+ break;
+ }
+
+ return -1;
+}
+
+/**
+ * kdb_getchar() - Read a single character from a kdb console (or consoles).
+ *
+ * Other than polling the various consoles that are currently enabled,
+ * most of the work done in this function is dealing with escape sequences.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources. Escape sequence processing
+ * has to be done as states in the polling loop.
+ *
+ * Return: The key pressed or a control code derived from an escape sequence.
+ */
+char kdb_getchar(void)
{
#define ESCAPE_UDELAY 1000
#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
- char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
- char *ped = escape_data;
+ char buf[4]; /* longest vt100 escape sequence is 4 bytes */
+ char *pbuf = buf;
int escape_delay = 0;
- get_char_func *f, *f_escape = NULL;
+ get_char_func *f, *f_prev = NULL;
int key;
for (f = &kdb_poll_funcs[0]; ; ++f) {
@@ -65,109 +139,37 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
touch_nmi_watchdog();
f = &kdb_poll_funcs[0];
}
- if (escape_delay == 2) {
- *ped = '\0';
- ped = escape_data;
- --escape_delay;
- }
- if (escape_delay == 1) {
- key = *ped++;
- if (!*ped)
- --escape_delay;
- break;
- }
+
key = (*f)();
if (key == -1) {
if (escape_delay) {
udelay(ESCAPE_UDELAY);
- --escape_delay;
+ if (--escape_delay == 0)
+ return '\e';
}
continue;
}
- if (bufsize <= 2) {
- if (key == '\r')
- key = '\n';
- *buffer++ = key;
- *buffer = '\0';
- return -1;
- }
- if (escape_delay == 0 && key == '\e') {
+
+ /*
+ * When the first character is received (or we get a change
+ * input source) we set ourselves up to handle an escape
+ * sequences (just in case).
+ */
+ if (f_prev != f) {
+ f_prev = f;
+ pbuf = buf;
escape_delay = ESCAPE_DELAY;
- ped = escape_data;
- f_escape = f;
- }
- if (escape_delay) {
- *ped++ = key;
- if (f_escape != f) {
- escape_delay = 2;
- continue;
- }
- if (ped - escape_data == 1) {
- /* \e */
- continue;
- } else if (ped - escape_data == 2) {
- /* \e<something> */
- if (key != '[')
- escape_delay = 2;
- continue;
- } else if (ped - escape_data == 3) {
- /* \e[<something> */
- int mapkey = 0;
- switch (key) {
- case 'A': /* \e[A, up arrow */
- mapkey = 16;
- break;
- case 'B': /* \e[B, down arrow */
- mapkey = 14;
- break;
- case 'C': /* \e[C, right arrow */
- mapkey = 6;
- break;
- case 'D': /* \e[D, left arrow */
- mapkey = 2;
- break;
- case '1': /* dropthrough */
- case '3': /* dropthrough */
- /* \e[<1,3,4>], may be home, del, end */
- case '4':
- mapkey = -1;
- break;
- }
- if (mapkey != -1) {
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- }
- continue;
- } else if (ped - escape_data == 4) {
- /* \e[<1,3,4><something> */
- int mapkey = 0;
- if (key == '~') {
- switch (escape_data[2]) {
- case '1': /* \e[1~, home */
- mapkey = 1;
- break;
- case '3': /* \e[3~, del */
- mapkey = 4;
- break;
- case '4': /* \e[4~, end */
- mapkey = 5;
- break;
- }
- }
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- continue;
- }
}
- break; /* A key to process */
+
+ *pbuf++ = key;
+ key = kdb_handle_escape(buf, pbuf - buf);
+ if (key < 0) /* no escape sequence; return best character */
+ return buf[pbuf - buf == 2 ? 1 : 0];
+ if (key > 0)
+ return key;
}
- return key;
+
+ unreachable();
}
/*
@@ -188,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
* function. It is not reentrant - it relies on the fact
* that while kdb is running on only one "master debug" cpu.
* Remarks:
- *
- * The buffer size must be >= 2. A buffer size of 2 means that the caller only
- * wants a single key.
- *
- * An escape key could be the start of a vt100 control sequence such as \e[D
- * (left arrow) or it could be a character in its own right. The standard
- * method for detecting the difference is to wait for 2 seconds to see if there
- * are any other characters. kdb is complicated by the lack of a timer service
- * (interrupts are off), by multiple input sources and by the need to sometimes
- * return after just one key. Escape sequence processing has to be done as
- * states in the polling loop.
+ * The buffer size must be >= 2.
*/
static char *kdb_read(char *buffer, size_t bufsize)
@@ -233,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
*cp = '\0';
kdb_printf("%s", buffer);
poll_again:
- key = kdb_read_get_key(buffer, bufsize);
- if (key == -1)
- return buffer;
+ key = kdb_getchar();
if (key != 9)
tab = 0;
switch (key) {
@@ -563,7 +553,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- struct console *c = console_drivers;
+ struct console *c;
unsigned long uninitialized_var(flags);
/* Serialize kdb_printf if multiple cpus try to write at once.
@@ -708,10 +698,9 @@ kdb_printit:
cp2++;
}
}
- while (c) {
+ for_each_console(c) {
c->write(c, cp, retlen - (cp - kdb_buffer));
touch_nmi_watchdog();
- c = c->next;
}
}
if (logging) {
@@ -746,7 +735,7 @@ kdb_printit:
/* check for having reached the LINES number of printed lines */
if (kdb_nextline >= linecount) {
- char buf1[16] = "";
+ char ch;
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -762,7 +751,6 @@ kdb_printit:
moreprompt = "more> ";
kdb_input_flush();
- c = console_drivers;
if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
@@ -772,48 +760,46 @@ kdb_printit:
cp++;
}
}
- while (c) {
+ for_each_console(c) {
c->write(c, moreprompt, strlen(moreprompt));
touch_nmi_watchdog();
- c = c->next;
}
if (logging)
printk("%s", moreprompt);
- kdb_read(buf1, 2); /* '2' indicates to return
- * immediately after getting one key. */
+ ch = kdb_getchar();
kdb_nextline = 1; /* Really set output line 1 */
/* empty and reset the buffer: */
kdb_buffer[0] = '\0';
next_avail = kdb_buffer;
size_avail = sizeof(kdb_buffer);
- if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ if ((ch == 'q') || (ch == 'Q')) {
/* user hit q or Q */
KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
KDB_STATE_CLEAR(PAGER);
/* end of command output; back to normal mode */
kdb_grepping_flag = 0;
kdb_printf("\n");
- } else if (buf1[0] == ' ') {
+ } else if (ch == ' ') {
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '\n') {
+ } else if (ch == '\n' || ch == '\r') {
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ } else if (ch == '/' && !kdb_grepping_flag) {
kdb_printf("\r");
kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
kdbgetenv("SEARCHPROMPT") ?: "search> ");
*strchrnul(kdb_grep_string, '\n') = '\0';
kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] && buf1[0] != '\n') {
- /* user hit something other than enter */
+ } else if (ch) {
+ /* user hit something unexpected */
suspend_grep = 1; /* for this recursion */
- if (buf1[0] != '/')
+ if (ch != '/')
kdb_printf(
"\nOnly 'q', 'Q' or '/' are processed at "
"more prompt, input ignored\n");
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 9ecfa37c7fbf..ba12e9f4661e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -73,7 +73,6 @@ int kdb_nextline = 1;
int kdb_state; /* General KDB state */
struct task_struct *kdb_current_task;
-EXPORT_SYMBOL(kdb_current_task);
struct pt_regs *kdb_current_regs;
const char *kdb_diemsg;
@@ -830,7 +829,7 @@ static void parse_grep(const char *str)
cp++;
while (isspace(*cp))
cp++;
- if (strncmp(cp, "grep ", 5)) {
+ if (!str_has_prefix(cp, "grep ")) {
kdb_printf("invalid 'pipe', see grephelp\n");
return;
}
@@ -1139,7 +1138,7 @@ static void kdb_dumpregs(struct pt_regs *regs)
console_loglevel = old_lvl;
}
-void kdb_set_current_task(struct task_struct *p)
+static void kdb_set_current_task(struct task_struct *p)
{
kdb_current_task = p;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 2118d8258b7c..2e296e4a234c 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig(struct task_struct *p, int sig);
extern void kdb_meminfo_proc_show(void);
+extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
@@ -239,8 +240,8 @@ extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
extern void debug_kusage(void);
-extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+extern struct pt_regs *kdb_current_regs;
#ifdef CONFIG_KDB_KEYBOARD
extern void kdb_kbd_cleanup_state(void);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 9decbba255fc..4c103a24e380 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H
config ARCH_HAS_DMA_SET_MASK
bool
+#
+# Select this option if the architecture needs special handling for
+# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
+# people thing of when saying write combine, so very few platforms should
+# need to enable this.
+#
+config ARCH_HAS_DMA_WRITE_COMBINE
+ bool
+
config DMA_DECLARE_COHERENT
bool
@@ -42,12 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
config ARCH_HAS_DMA_PREP_COHERENT
bool
-config ARCH_HAS_DMA_COHERENT_TO_PFN
- bool
-
-config ARCH_HAS_DMA_MMAP_PGPROT
- bool
-
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
@@ -62,9 +65,18 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+#
+# Should be selected if we can mmap non-coherent mappings to userspace.
+# The only thing that is really required is a way to set an uncached bit
+# in the pagetables
+#
+config DMA_NONCOHERENT_MMAP
+ bool
+
config DMA_REMAP
depends on MMU
select GENERIC_ALLOCATOR
+ select DMA_NONCOHERENT_MMAP
bool
config DMA_DIRECT_REMAP
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 29fd6590dc1e..551b0eb7028a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -122,21 +122,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_release_coherent_memory(mem);
return ret;
}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dma_release_coherent_memory(mem);
- dev->dma_mem = NULL;
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
- ssize_t size, dma_addr_t *dma_handle)
+static void *__dma_alloc_from_coherent(struct device *dev,
+ struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
{
int order = get_order(size);
unsigned long flags;
@@ -155,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
/*
* Memory was found in the coherent area.
*/
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT);
ret = mem->virt_base + (pageno << PAGE_SHIFT);
spin_unlock_irqrestore(&mem->spinlock, flags);
memset(ret, 0, size);
@@ -187,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
if (!mem)
return 0;
- *ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+ *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
return 1;
}
-void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
{
if (!dma_coherent_default_memory)
return NULL;
- return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
- dma_handle);
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
}
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
@@ -288,7 +278,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
-EXPORT_SYMBOL(dma_mmap_from_dev_coherent);
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 2bd410f934b3..daa4e6eefdde 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
-static phys_addr_t size_cmdline = -1;
-static phys_addr_t base_cmdline;
-static phys_addr_t limit_cmdline;
+static const phys_addr_t size_bytes __initconst =
+ (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+static phys_addr_t size_cmdline __initdata = -1;
+static phys_addr_t base_cmdline __initdata;
+static phys_addr_t limit_cmdline __initdata;
static int __init early_cma(char *p)
{
@@ -230,9 +231,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
- int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
- size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- size_t align = get_order(PAGE_ALIGN(size));
+ size_t count = size >> PAGE_SHIFT;
struct page *page = NULL;
struct cma *cma = NULL;
@@ -243,14 +242,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
/* CMA can be used only in the context which permits sleeping */
if (cma && gfpflags_allow_blocking(gfp)) {
+ size_t align = get_order(size);
size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
}
- /* Fallback allocation of normal pages */
- if (!page)
- page = alloc_pages_node(node, gfp, align);
return page;
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 099002d84f46..2031ed1ad7fa 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -27,7 +27,7 @@
#include <asm/sections.h>
-#define HASH_SIZE 1024ULL
+#define HASH_SIZE 16384ULL
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
@@ -54,40 +54,40 @@ enum map_err_types {
* struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
* @list: node on pre-allocated free_entries list
* @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
- * @type: single, page, sg, coherent
- * @pfn: page frame of the start address
- * @offset: offset of mapping relative to pfn
* @size: length of the mapping
+ * @type: single, page, sg, coherent
* @direction: enum dma_data_direction
* @sg_call_ents: 'nents' from dma_map_sg
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
+ * @pfn: page frame of the start address
+ * @offset: offset of mapping relative to pfn
* @map_err_type: track whether dma_mapping_error() was checked
* @stacktrace: support backtraces when a violation is detected
*/
struct dma_debug_entry {
struct list_head list;
struct device *dev;
- int type;
- unsigned long pfn;
- size_t offset;
u64 dev_addr;
u64 size;
+ int type;
int direction;
int sg_call_ents;
int sg_mapped_ents;
+ unsigned long pfn;
+ size_t offset;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
#endif
-};
+} ____cacheline_aligned_in_smp;
typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *);
struct hash_bucket {
struct list_head list;
spinlock_t lock;
-} ____cacheline_aligned_in_smp;
+};
/* Hash list to save the allocated dma addresses */
static struct hash_bucket dma_entry_hash[HASH_SIZE];
@@ -161,7 +161,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
{
#ifdef CONFIG_STACKTRACE
if (entry) {
- pr_warning("Mapped at:\n");
+ pr_warn("Mapped at:\n");
stack_trace_print(entry->stack_entries, entry->stack_len, 0);
}
#endif
@@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
* Give up exclusive access to the hash bucket
*/
static void put_hash_bucket(struct hash_bucket *bucket,
- unsigned long *flags)
+ unsigned long flags)
__releases(&bucket->lock)
{
- unsigned long __flags = *flags;
-
- spin_unlock_irqrestore(&bucket->lock, __flags);
+ spin_unlock_irqrestore(&bucket->lock, flags);
}
static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
@@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
/*
* Nothing found, go back a hash bucket
*/
- put_hash_bucket(*bucket, flags);
+ put_hash_bucket(*bucket, *flags);
range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
@@ -420,6 +418,7 @@ void debug_dma_dump_mappings(struct device *dev)
}
spin_unlock_irqrestore(&bucket->lock, flags);
+ cond_resched();
}
}
@@ -608,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry)
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
@@ -1001,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref)
if (!entry) {
/* must drop lock before calling dma_mapping_error */
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
if (dma_mapping_error(ref->dev, ref->dev_addr)) {
err_printk(ref->dev, NULL,
@@ -1083,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref)
hash_bucket_del(entry);
dma_entry_free(entry);
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
static void check_for_stack(struct device *dev,
@@ -1203,7 +1202,7 @@ static void check_sync(struct device *dev,
}
out:
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
static void check_sg_segment(struct device *dev, struct scatterlist *sg)
@@ -1318,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
}
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
EXPORT_SYMBOL(debug_dma_mapping_error);
@@ -1391,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev,
if (entry)
mapped_ents = entry->sg_mapped_ents;
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
return mapped_ents;
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 795c9b095d75..6af7ae83c4ad 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -12,25 +12,25 @@
#include <linux/dma-contiguous.h>
#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
+#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/swiotlb.h>
/*
- * Most architectures use ZONE_DMA for the first 16 Megabytes, but
- * some use it for entirely different regions:
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
+ * it for entirely different regions. In that case the arch code needs to
+ * override the variable below for dma-direct to work properly.
*/
-#ifndef ARCH_ZONE_DMA_BITS
-#define ARCH_ZONE_DMA_BITS 24
-#endif
+unsigned int zone_dma_bits __ro_after_init = 24;
static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
if (!dev->dma_mask) {
dev_err_once(dev, "DMA map on device without dma_mask\n");
- } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+ } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) {
dev_err_once(dev,
- "overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
+ "overflow %pad+%zu of DMA mask %llx bus limit %llx\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
}
WARN_ON_ONCE(1);
}
@@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
return phys_to_dma(dev, phys);
}
+static inline struct page *dma_direct_to_page(struct device *dev,
+ dma_addr_t dma_addr)
+{
+ return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr)));
+}
+
u64 dma_direct_get_required_mask(struct device *dev)
{
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
@@ -51,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev)
}
static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_mask)
+ u64 *phys_limit)
{
- if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
- dma_mask = dev->bus_dma_mask;
+ u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
if (force_dma_unencrypted(dev))
- *phys_mask = __dma_to_phys(dev, dma_mask);
+ *phys_limit = __dma_to_phys(dev, dma_limit);
else
- *phys_mask = dma_to_phys(dev, dma_mask);
+ *phys_limit = dma_to_phys(dev, dma_limit);
/*
* Optimistically try the zone that the physical address mask falls
@@ -69,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
* Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
* zones.
*/
- if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
return GFP_DMA;
- if (*phys_mask <= DMA_BIT_MASK(32))
+ if (*phys_limit <= DMA_BIT_MASK(32))
return GFP_DMA32;
return 0;
}
@@ -79,14 +84,16 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
return phys_to_dma_direct(dev, phys) + size - 1 <=
- min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+ gfp_t gfp, unsigned long attrs)
{
+ size_t alloc_size = PAGE_ALIGN(size);
+ int node = dev_to_node(dev);
struct page *page = NULL;
- u64 phys_mask;
+ u64 phys_limit;
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
@@ -94,15 +101,21 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
+ &phys_limit);
+ page = dma_alloc_contiguous(dev, alloc_size, gfp);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ dma_free_contiguous(dev, page, alloc_size);
+ page = NULL;
+ }
again:
- page = dma_alloc_contiguous(dev, size, gfp);
+ if (!page)
+ page = alloc_pages_node(node, gfp, get_order(alloc_size));
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
- phys_mask < DMA_BIT_MASK(64) &&
+ phys_limit < DMA_BIT_MASK(64) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;
@@ -123,7 +136,16 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_alloc_need_uncached(dev, attrs) &&
+ !gfpflags_allow_blocking(gfp)) {
+ ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp);
+ if (!ret)
+ return NULL;
+ goto done;
+ }
+
+ page = __dma_direct_alloc_pages(dev, size, gfp, attrs);
if (!page)
return NULL;
@@ -132,9 +154,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
- return page;
+ ret = page;
+ goto done;
+ }
+
+ if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_alloc_need_uncached(dev, attrs)) ||
+ (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ /* remove any dirty cache lines on the kernel alias */
+ arch_dma_prep_coherent(page, PAGE_ALIGN(size));
+
+ /* create a coherent mapping */
+ ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size),
+ dma_pgprot(dev, PAGE_KERNEL, attrs),
+ __builtin_return_address(0));
+ if (!ret) {
+ dma_free_contiguous(dev, page, size);
+ return ret;
+ }
+
+ memset(ret, 0, size);
+ goto done;
}
if (PageHighMem(page)) {
@@ -145,17 +186,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
* so log an error and fail.
*/
dev_info(dev, "Rejecting highmem page from CMA.\n");
- __dma_direct_free_pages(dev, size, page);
+ dma_free_contiguous(dev, page, size);
return NULL;
}
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
+ if (force_dma_unencrypted(dev))
set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
- *dma_handle = __phys_to_dma(dev, page_to_phys(page));
- } else {
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
- }
+
memset(ret, 0, size);
if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
@@ -163,15 +201,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
arch_dma_prep_coherent(page, size);
ret = uncached_kernel_address(ret);
}
-
+done:
+ if (force_dma_unencrypted(dev))
+ *dma_handle = __phys_to_dma(dev, page_to_phys(page));
+ else
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
return ret;
}
-void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
-{
- dma_free_contiguous(dev, page, size);
-}
-
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
{
@@ -180,23 +217,28 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
- __dma_direct_free_pages(dev, size, cpu_addr);
+ dma_free_contiguous(dev, cpu_addr, size);
return;
}
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_free_from_pool(cpu_addr, PAGE_ALIGN(size)))
+ return;
+
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
- if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
- dma_alloc_need_uncached(dev, attrs))
- cpu_addr = cached_kernel_address(cpu_addr);
- __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ vunmap(cpu_addr);
+
+ dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
dma_alloc_need_uncached(dev, attrs))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
@@ -206,6 +248,7 @@ void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
dma_alloc_need_uncached(dev, attrs))
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
else
@@ -223,7 +266,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, paddr, size, dir);
+ arch_sync_dma_for_device(paddr, size, dir);
}
EXPORT_SYMBOL(dma_direct_sync_single_for_device);
@@ -241,7 +284,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, paddr, sg->length,
+ arch_sync_dma_for_device(paddr, sg->length,
dir);
}
}
@@ -257,8 +300,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(dev, paddr, size, dir);
- arch_sync_dma_for_cpu_all(dev);
+ arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_all();
}
if (unlikely(is_swiotlb_buffer(paddr)))
@@ -276,7 +319,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(dev, paddr, sg->length, dir);
+ arch_sync_dma_for_cpu(paddr, sg->length, dir);
if (unlikely(is_swiotlb_buffer(paddr)))
swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
@@ -284,7 +327,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
}
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu_all(dev);
+ arch_sync_dma_for_cpu_all();
}
EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
@@ -297,7 +340,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
}
EXPORT_SYMBOL(dma_direct_unmap_page);
@@ -318,7 +361,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
size_t size)
{
return swiotlb_force != SWIOTLB_FORCE &&
- dma_capable(dev, dma_addr, size);
+ dma_capable(dev, dma_addr, size, true);
}
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
@@ -335,7 +378,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
}
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(dev, phys, size, dir);
+ arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
}
EXPORT_SYMBOL(dma_direct_map_page);
@@ -367,7 +410,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
{
dma_addr_t dma_addr = paddr;
- if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
report_addr(dev, dma_addr, size);
return DMA_MAPPING_ERROR;
}
@@ -376,6 +419,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
}
EXPORT_SYMBOL(dma_direct_map_resource);
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = dma_direct_to_page(dev, dma_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+#ifdef CONFIG_MMU
+bool dma_direct_can_mmap(struct device *dev)
+{
+ return dev_is_dma_coherent(dev) ||
+ IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP);
+}
+
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr));
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+}
+#else /* CONFIG_MMU */
+bool dma_direct_can_mmap(struct device *dev)
+{
+ return false;
+}
+
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ return -ENXIO;
+}
+#endif /* CONFIG_MMU */
+
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
* able to satisfy them - either by not supporting more physical memory, or by
@@ -387,7 +483,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
u64 min_mask;
if (IS_ENABLED(CONFIG_ZONE_DMA))
- min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
+ min_mask = DMA_BIT_MASK(zone_dma_bits);
else
min_mask = DMA_BIT_MASK(32);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b0038ca3aa92..12ff766ec1fa 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -112,41 +112,38 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page;
+ struct page *page = virt_to_page(cpu_addr);
int ret;
- if (!dev_is_dma_coherent(dev)) {
- unsigned long pfn;
-
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
- return -ENXIO;
-
- /* If the PFN is not valid, we do not have a struct page */
- pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
- if (!pfn_valid(pfn))
- return -ENXIO;
- page = pfn_to_page(pfn);
- } else {
- page = virt_to_page(cpu_addr);
- }
-
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
if (!ret)
sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
return ret;
}
+/*
+ * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
+ * that the intention is to allow exporting memory allocated via the
+ * coherent DMA APIs through the dma_buf API, which only accepts a
+ * scattertable. This presents a couple of problems:
+ * 1. Not all memory allocated via the coherent DMA APIs is backed by
+ * a struct page
+ * 2. Passing coherent DMA memory into the streaming APIs is not allowed
+ * as we will try to flush the memory through a different alias to that
+ * actually being used (and the flushes are redundant.)
+ */
int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->get_sgtable)
- return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
- return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
+ if (dma_is_direct(ops))
+ return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
+ if (!ops->get_sgtable)
+ return -ENXIO;
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);
@@ -161,9 +158,11 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
(IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
(attrs & DMA_ATTR_NON_CONSISTENT)))
return prot;
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT))
- return arch_dma_mmap_pgprot(dev, prot, attrs);
- return pgprot_noncached(prot);
+#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
+ if (attrs & DMA_ATTR_WRITE_COMBINE)
+ return pgprot_writecombine(prot);
+#endif
+ return pgprot_dmacoherent(prot);
}
#endif /* CONFIG_MMU */
@@ -174,11 +173,10 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
-#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP
+#ifdef CONFIG_MMU
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
- unsigned long pfn;
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -189,26 +187,32 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
if (off >= count || user_count > count - off)
return -ENXIO;
- if (!dev_is_dma_coherent(dev)) {
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
- return -ENXIO;
-
- /* If the PFN is not valid, we do not have a struct page */
- pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
- if (!pfn_valid(pfn))
- return -ENXIO;
- } else {
- pfn = page_to_pfn(virt_to_page(cpu_addr));
- }
-
- return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
-#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
+#endif /* CONFIG_MMU */
}
/**
+ * dma_can_mmap - check if a given device supports dma_mmap_*
+ * @dev: device to check
+ *
+ * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to
+ * map DMA allocations to userspace.
+ */
+bool dma_can_mmap(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_is_direct(ops))
+ return dma_direct_can_mmap(dev);
+ return ops->mmap != NULL;
+}
+EXPORT_SYMBOL_GPL(dma_can_mmap);
+
+/**
* dma_mmap_attrs - map a coherent DMA allocation into user space
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
* @vma: vm_area_struct describing requested user mapping
@@ -227,31 +231,15 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->mmap)
- return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
- return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+ if (dma_is_direct(ops))
+ return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
+ if (!ops->mmap)
+ return -ENXIO;
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_mmap_attrs);
-static u64 dma_default_get_required_mask(struct device *dev)
-{
- u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
- u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
- u64 mask;
-
- if (!high_totalram) {
- /* convert to mask just covering totalram */
- low_totalram = (1 << (fls(low_totalram) - 1));
- low_totalram += low_totalram - 1;
- mask = low_totalram;
- } else {
- high_totalram = (1 << (fls(high_totalram) - 1));
- high_totalram += high_totalram - 1;
- mask = (((u64)high_totalram) << 32) + 0xffffffff;
- }
- return mask;
-}
-
u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -260,7 +248,16 @@ u64 dma_get_required_mask(struct device *dev)
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
- return dma_default_get_required_mask(dev);
+
+ /*
+ * We require every DMA ops implementation to at least support a 32-bit
+ * DMA mask (and use bounce buffering if that isn't supported in
+ * hardware). As the direct mapping code has its own routine to
+ * actually report an optimal mask we default to 32-bit here as that
+ * is the right thing for most IOMMUs, and at least not actively
+ * harmful in general.
+ */
+ return DMA_BIT_MASK(32);
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
@@ -317,12 +314,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
EXPORT_SYMBOL(dma_free_attrs);
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
- if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
- dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -353,7 +344,6 @@ int dma_set_mask(struct device *dev, u64 mask)
return -EIO;
arch_dma_set_mask(dev, mask);
- dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
}
@@ -371,7 +361,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
if (!dma_supported(dev, mask))
return -EIO;
- dma_check_mask(dev, mask);
dev->coherent_dma_mask = mask;
return 0;
}
@@ -405,3 +394,14 @@ size_t dma_max_mapping_size(struct device *dev)
return size;
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+
+unsigned long dma_get_merge_boundary(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!ops || !ops->get_merge_boundary)
+ return 0; /* can't merge */
+
+ return ops->get_merge_boundary(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index ffe78f0b2fe4..d14cbc83986a 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -11,13 +11,21 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
+struct page **dma_common_find_pages(void *cpu_addr)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || area->flags != VM_DMA_COHERENT)
+ return NULL;
+ return area->pages;
+}
+
static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ size_t size, pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = get_vm_area_caller(size, vm_flags, caller);
+ area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
if (!area)
return NULL;
@@ -34,12 +42,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_pages_remap(struct page **pages, size_t size,
- unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
if (!area)
return NULL;
@@ -53,7 +60,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_contiguous_remap(struct page *page, size_t size,
- unsigned long vm_flags,
pgprot_t prot, const void *caller)
{
int i;
@@ -67,7 +73,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
for (i = 0; i < (size >> PAGE_SHIFT); i++)
pages[i] = nth_page(page, i);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
kfree(pages);
@@ -79,11 +85,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
/*
* Unmaps a range previously mapped by dma_common_*_remap
*/
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+void dma_common_free_remap(void *cpu_addr, size_t size)
{
struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || (area->flags & vm_flags) != vm_flags) {
+ if (!area || area->flags != VM_DMA_COHERENT) {
WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
return;
}
@@ -105,7 +111,16 @@ static int __init early_coherent_pool(char *p)
}
early_param("coherent_pool", early_coherent_pool);
-int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+static gfp_t dma_atomic_pool_gfp(void)
+{
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ return GFP_DMA;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ return GFP_DMA32;
+ return GFP_KERNEL;
+}
+
+static int __init dma_atomic_pool_init(void)
{
unsigned int pool_size_order = get_order(atomic_pool_size);
unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
@@ -117,7 +132,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
page = dma_alloc_from_contiguous(NULL, nr_pages,
pool_size_order, false);
else
- page = alloc_pages(gfp, pool_size_order);
+ page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);
if (!page)
goto out;
@@ -127,8 +142,9 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
if (!atomic_pool)
goto free_page;
- addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
- prot, __builtin_return_address(0));
+ addr = dma_common_contiguous_remap(page, atomic_pool_size,
+ pgprot_dmacoherent(PAGE_KERNEL),
+ __builtin_return_address(0));
if (!addr)
goto destroy_genpool;
@@ -143,7 +159,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
return 0;
remove_mapping:
- dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+ dma_common_free_remap(addr, atomic_pool_size);
destroy_genpool:
gen_pool_destroy(atomic_pool);
atomic_pool = NULL;
@@ -155,13 +171,14 @@ out:
atomic_pool_size / 1024);
return -ENOMEM;
}
+postcore_initcall(dma_atomic_pool_init);
bool dma_in_atomic_pool(void *start, size_t size)
{
if (unlikely(!atomic_pool))
return false;
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
@@ -193,59 +210,4 @@ bool dma_free_from_pool(void *start, size_t size)
gen_pool_free(atomic_pool, (unsigned long)start, size);
return true;
}
-
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t flags, unsigned long attrs)
-{
- struct page *page = NULL;
- void *ret;
-
- size = PAGE_ALIGN(size);
-
- if (!gfpflags_allow_blocking(flags)) {
- ret = dma_alloc_from_pool(size, &page, flags);
- if (!ret)
- return NULL;
- goto done;
- }
-
- page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
- if (!page)
- return NULL;
-
- /* remove any dirty cache lines on the kernel alias */
- arch_dma_prep_coherent(page, size);
-
- /* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
- __builtin_return_address(0));
- if (!ret) {
- __dma_direct_free_pages(dev, size, page);
- return ret;
- }
-
- memset(ret, 0, size);
-done:
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
- return ret;
-}
-
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, unsigned long attrs)
-{
- if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
- phys_addr_t phys = dma_to_phys(dev, dma_handle);
- struct page *page = pfn_to_page(__phys_to_pfn(phys));
-
- vunmap(vaddr);
- __dma_direct_free_pages(dev, size, page);
- }
-}
-
-long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
- dma_addr_t dma_addr)
-{
- return __phys_to_pfn(dma_to_phys(dev, dma_addr));
-}
#endif /* CONFIG_DMA_DIRECT_REMAP */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9de232229063..9280d6f8271e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr, size_t size,
+ phys_addr_t orig_addr,
+ size_t mapping_size,
+ size_t alloc_size,
enum dma_data_direction dir,
unsigned long attrs)
{
@@ -461,8 +463,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
if (mem_encrypt_active())
- pr_warn_once("%s is active and system is using DMA bounce buffers\n",
- sme_active() ? "SME" : "SEV");
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ if (mapping_size > alloc_size) {
+ dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
mask = dma_get_seg_boundary(hwdev);
@@ -471,8 +478,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
/*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
max_slots = mask + 1
? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -481,8 +488,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* For mappings greater than or equal to a page, we limit the stride
* (and hence alignment) to a page size.
*/
- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size >= PAGE_SIZE)
+ nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (alloc_size >= PAGE_SIZE)
stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
else
stride = 1;
@@ -547,7 +554,7 @@ not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- size, io_tlb_nslabs, tmp_io_tlb_used);
+ alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
return (phys_addr_t)DMA_MAPPING_ERROR;
found:
io_tlb_used += nslots;
@@ -562,7 +569,7 @@ found:
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -571,11 +578,11 @@ found:
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
@@ -585,7 +592,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
if (orig_addr != INVALID_PHYS_ADDR &&
!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -665,14 +672,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
- *phys, size, dir, attrs);
+ *phys, size, size, dir, attrs);
if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
return false;
/* Ensure that the address returned is DMA'ble */
*dma_addr = __phys_to_dma(dev, *phys);
- if (unlikely(!dma_capable(dev, *dma_addr, size))) {
- swiotlb_tbl_unmap_single(dev, *phys, size, dir,
+ if (unlikely(!dma_capable(dev, *dma_addr, size, true))) {
+ swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
return false;
}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index fc482c8e0bd8..57fb4dcff434 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/binfmts.h>
+#include <linux/elfcore.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..e453589da97c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event,
/*
* Because cgroup events are always per-cpu events,
- * this will always be called from the right CPU.
+ * @ctx == &cpuctx->ctx.
*/
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
/*
* Since setting cpuctx->cgrp is conditional on the current @cgrp
@@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
if (add)
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ list_add(cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
else
list_del(cpuctx_entry);
}
@@ -1031,7 +1032,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
}
-void
+static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
@@ -1103,7 +1104,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
@@ -1121,7 +1122,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
if (!cpuctx->hrtimer_active) {
cpuctx->hrtimer_active = 1;
hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
@@ -1887,6 +1888,104 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->generation++;
}
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
+
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *iter;
+
+ /*
+ * If event uses aux_event tear down the link
+ */
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
+
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event->group_leader) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ event_sched_out(iter, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group in torn down, the aux_output events loose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -1902,6 +2001,8 @@ static void perf_group_detach(struct perf_event *event)
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
@@ -2154,7 +2255,7 @@ static void __perf_event_disable(struct perf_event *event,
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
- * remains valid. This condition is satisifed when called through
+ * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
@@ -2581,6 +2682,25 @@ perf_install_in_context(struct perf_event_context *ctx,
*/
smp_store_release(&event->ctx, ctx);
+ /*
+ * perf_event_attr::disabled events will not run and can be initialized
+ * without IPI. Except when this is the first event for the context, in
+ * that case we need the magic of the IPI to set ctx->is_active.
+ *
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
+ * event will issue the IPI and reprogram the hardware.
+ */
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -3119,10 +3239,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ struct pmu *pmu = ctx->pmu;
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (pmu->swap_task_ctx)
+ pmu->swap_task_ctx(ctx, next_ctx);
+ else
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3694,11 +3825,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3723,9 +3866,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(task_ctx);
+ task_event = ctx_event_to_rotate(task_ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
@@ -4089,10 +4232,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
+ if (task)
+ ctx->task = get_task_struct(task);
ctx->pmu = pmu;
return ctx;
@@ -4134,8 +4275,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ err = perf_allow_cpu(&event->attr);
+ if (err)
+ return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4232,7 +4374,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
+ struct perf_buffer *rb);
static void detach_sb_event(struct perf_event *event)
{
@@ -4444,6 +4586,8 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4897,6 +5041,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -4907,7 +5055,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -4934,6 +5082,24 @@ static void _perf_event_reset(struct perf_event *event)
perf_event_update_userpage(event);
}
+/* Assume it's not an event with inherit set. */
+u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(event->attr.inherit);
+ _perf_event_disable(event);
+ count = local64_read(&event->count);
+ if (reset)
+ local64_set(&event->count, 0);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(perf_event_pause);
+
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
@@ -5011,16 +5177,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value)
return event->pmu->check_period(event, value);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
+static int _perf_event_period(struct perf_event *event, u64 value)
{
- u64 value;
-
if (!is_sampling_event(event))
return -EINVAL;
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
if (!value)
return -EINVAL;
@@ -5038,6 +5199,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
return 0;
}
+int perf_event_period(struct perf_event *event, u64 value)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_period(event, value);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(perf_event_period);
+
static const struct file_operations perf_fops;
static inline int perf_fget_light(int fd, struct fd *p)
@@ -5081,8 +5255,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
- return perf_event_period(event, (u64 __user *)arg);
+ {
+ u64 value;
+
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+ return -EFAULT;
+ return _perf_event_period(event, value);
+ }
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
@@ -5117,7 +5297,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_bpf_prog(event, arg);
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5161,6 +5341,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5248,7 +5433,7 @@ static void calc_timer_values(struct perf_event *event,
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5280,7 +5465,7 @@ void __weak arch_perf_update_userpage(
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
@@ -5331,7 +5516,7 @@ EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
struct perf_event *event = vmf->vma->vm_file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
vm_fault_t ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -5364,9 +5549,9 @@ unlock:
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb)
+ struct perf_buffer *rb)
{
- struct ring_buffer *old_rb = NULL;
+ struct perf_buffer *old_rb = NULL;
unsigned long flags;
if (event->rb) {
@@ -5424,7 +5609,7 @@ static void ring_buffer_attach(struct perf_event *event,
static void ring_buffer_wakeup(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5435,9 +5620,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5450,7 +5635,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct perf_buffer *rb)
{
if (!refcount_dec_and_test(&rb->refcount))
return;
@@ -5488,7 +5673,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = ring_buffer_get(event);
+ struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -5512,7 +5697,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
@@ -5585,7 +5770,8 @@ again:
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
@@ -5605,8 +5791,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
- struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
@@ -5623,6 +5809,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5727,16 +5917,30 @@ accounting:
*/
user_lock_limit *= num_online_cpus();
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ user_locked = atomic_long_read(&user->locked_vm);
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += user_extra;
+
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
extra = user_locked - user_lock_limit;
+ user_extra -= extra;
+ }
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -5971,7 +6175,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6066,6 +6270,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+ unsigned long pad;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6385,11 +6705,18 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
@@ -6533,7 +6860,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_STACK_USER) {
/*
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
@@ -6573,6 +6900,35 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -6803,7 +7159,7 @@ void perf_event_exec(void)
}
struct remote_output {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
int err;
};
@@ -6811,7 +7167,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
{
struct perf_event *parent = event->parent;
struct remote_output *ro = data;
- struct ring_buffer *rb = ro->rb;
+ struct perf_buffer *rb = ro->rb;
struct stop_event_data sd = {
.event = event,
};
@@ -6839,7 +7195,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu = event->ctx->pmu;
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
@@ -7148,7 +7504,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
{
struct path ns_path;
struct inode *ns_inode;
- void *error;
+ int error;
error = ns_get_path(&ns_path, task, ns_ops);
if (!error) {
@@ -9491,7 +9847,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
period = max_t(u64, 10000, hwc->sample_period);
}
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -9513,7 +9869,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
@@ -9924,7 +10280,7 @@ static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- int cpu, ret;
+ int cpu, ret, max = PERF_TYPE_MAX;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
@@ -9937,12 +10293,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto skip_type;
pmu->name = name;
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
+ if (type != PERF_TYPE_SOFTWARE) {
+ if (type >= 0)
+ max = type;
+
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
goto free_pdc;
- }
+
+ WARN_ON(type >= 0 && ret != type);
+
+ type = ret;
}
pmu->type = type;
@@ -10019,7 +10380,16 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- list_add_rcu(&pmu->entry, &pmus);
+ /*
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
+ * since these cannot be in the IDR. This way the linear search
+ * is fast, provided a valid software event is provided.
+ */
+ if (type == PERF_TYPE_SOFTWARE || !name)
+ list_add_rcu(&pmu->entry, &pmus);
+ else
+ list_add_tail_rcu(&pmu->entry, &pmus);
+
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -10032,7 +10402,7 @@ free_dev:
put_device(pmu->dev);
free_idr:
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
@@ -10054,7 +10424,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
if (pmu->nr_addr_filters)
@@ -10124,9 +10494,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
+ int idx, type, ret;
struct pmu *pmu;
- int idx;
- int ret;
idx = srcu_read_lock(&pmus_srcu);
@@ -10138,17 +10507,32 @@ static struct pmu *perf_init_event(struct perf_event *event)
goto unlock;
}
+ /*
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * are often aliases for PERF_TYPE_RAW.
+ */
+ type = event->attr.type;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
+ type = PERF_TYPE_RAW;
+
+again:
rcu_read_lock();
- pmu = idr_find(&pmu_idr, event->attr.type);
+ pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type) {
+ type = event->attr.type;
+ goto again;
+ }
+
if (ret)
pmu = ERR_PTR(ret);
+
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -10355,8 +10739,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- get_task_struct(task);
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -10368,12 +10751,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
if (overflow_handler == bpf_overflow_handler) {
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+ struct bpf_prog *prog = parent_event->prog;
- if (IS_ERR(prog)) {
- err = PTR_ERR(prog);
- goto err_ns;
- }
+ bpf_prog_inc(prog);
event->prog = prog;
event->orig_overflow_handler =
parent_event->orig_overflow_handler;
@@ -10426,6 +10806,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ /*
+ * Disallow uncore-cgroup events, they don't make sense as the cgroup will
+ * be different on other CPUs in the uncore mask.
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ err = -EINVAL;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_output &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -10465,11 +10860,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ err = security_perf_event_alloc(event);
+ if (err)
+ goto err_callchain_buffer;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10498,58 +10902,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
u32 size;
int ret;
- if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = PERF_ATTR_SIZE_VER0;
-
- if (size < PERF_ATTR_SIZE_VER0)
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
attr->size = size;
- if (attr->__reserved_1)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10587,9 +10962,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel(attr);
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10630,7 +11007,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL;
+ struct perf_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -10802,13 +11179,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
}
if (attr.namespaces) {
@@ -10825,9 +11208,18 @@ SYSCALL_DEFINE5(perf_event_open,
}
/* Only privileged users can get physical addresses */
- if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
+
+ err = security_locked_down(LOCKDOWN_PERF);
+ if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ return err;
+
+ err = 0;
/*
* In cgroup mode, the pid argument is used to pass the fd
@@ -11082,6 +11474,10 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
+ goto err_locked;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
@@ -11228,8 +11624,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
int err;
/*
- * Get the target context (task or percpu):
+ * Grouping is not supported for kernel events, neither is 'AUX',
+ * make sure the caller's intentions are adjusted.
*/
+ if (attr->aux_output)
+ return ERR_PTR(-EINVAL);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
@@ -11241,6 +11640,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ /*
+ * Get the target context (task or percpu):
+ */
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -11692,7 +12094,7 @@ inherit_event(struct perf_event *parent_event,
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}
@@ -11794,6 +12196,10 @@ static int inherit_group(struct perf_event *parent_event,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
+
+ if (sub->aux_event == parent_event && child_ctr &&
+ !perf_get_aux_event(child_ctr, leader))
+ return -EINVAL;
}
return 0;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c5cd852fe86b..3cc8416ec844 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,
int register_perf_hw_breakpoint(struct perf_event *bp)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = reserve_bp_slot(bp);
@@ -461,7 +461,7 @@ int
modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
bool check)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = hw_breakpoint_parse(bp, attr, &hw);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 3aef4191798c..f16f66b6b655 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -10,7 +10,7 @@
#define RING_BUFFER_WRITABLE 0x01
-struct ring_buffer {
+struct perf_buffer {
refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
@@ -50,6 +50,7 @@ struct ring_buffer {
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
+ int aux_in_sampling;
void **aux_pages;
void *aux_priv;
@@ -57,17 +58,17 @@ struct ring_buffer {
void *data_pages[0];
};
-extern void rb_free(struct ring_buffer *rb);
+extern void rb_free(struct perf_buffer *rb);
static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
- rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb = container_of(rcu_head, struct perf_buffer, rcu_head);
rb_free(rb);
}
-static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
+static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause)
{
if (!pause && rb->nr_pages)
rb->paused = 0;
@@ -75,16 +76,16 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
rb->paused = 1;
}
-extern struct ring_buffer *
+extern struct perf_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
-extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
-extern void rb_free_aux(struct ring_buffer *rb);
-extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
-extern void ring_buffer_put(struct ring_buffer *rb);
+extern void rb_free_aux(struct perf_buffer *rb);
+extern struct perf_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct perf_buffer *rb);
-static inline bool rb_has_aux(struct ring_buffer *rb)
+static inline bool rb_has_aux(struct perf_buffer *rb)
{
return !!rb->aux_nr_pages;
}
@@ -93,7 +94,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
extern struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff);
#ifdef CONFIG_PERF_USE_VMALLOC
/*
@@ -102,25 +103,25 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
* Required for architectures that have d-cache aliasing issues.
*/
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return rb->page_order;
}
#else
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return 0;
}
#endif
-static inline unsigned long perf_data_size(struct ring_buffer *rb)
+static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
return rb->aux_nr_pages << PAGE_SHIFT;
}
@@ -140,7 +141,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
buf += written; \
handle->size -= written; \
if (!handle->size) { \
- struct ring_buffer *rb = handle->rb; \
+ struct perf_buffer *rb = handle->rb; \
\
handle->page++; \
handle->page &= rb->nr_pages - 1; \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ffb59a4ef4ff..192b8abc6330 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -35,7 +35,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
preempt_disable();
@@ -49,7 +49,7 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
static void perf_output_put_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
unsigned long head;
unsigned int nest;
@@ -150,7 +150,7 @@ __perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
bool backward)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
@@ -301,7 +301,7 @@ void perf_output_end(struct perf_output_handle *handle)
}
static void
-ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
{
long max_size = perf_data_size(rb);
@@ -361,7 +361,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
{
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned int nest;
if (output_event->parent)
@@ -449,7 +449,7 @@ err:
}
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
-static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
+static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
{
if (rb->aux_overwrite)
return false;
@@ -475,7 +475,7 @@ static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
unsigned long aux_head;
/* in overwrite mode, driver provides aux_head via handle */
@@ -532,7 +532,7 @@ EXPORT_SYMBOL_GPL(perf_aux_output_end);
*/
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
if (size > handle->size)
return -ENOSPC;
@@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
}
EXPORT_SYMBOL_GPL(perf_get_aux);
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to)
+{
+ struct perf_buffer *rb = aux_handle->rb;
+ unsigned long tocopy, remainder, len = 0;
+ void *addr;
+
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+ do {
+ tocopy = PAGE_SIZE - offset_in_page(from);
+ if (to > from)
+ tocopy = min(tocopy, to - from);
+ if (!tocopy)
+ break;
+
+ addr = rb->aux_pages[from >> PAGE_SHIFT];
+ addr += offset_in_page(from);
+
+ remainder = perf_output_copy(handle, addr, tocopy);
+ if (remainder)
+ return -EFAULT;
+
+ len += tocopy;
+ from += tocopy;
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ } while (to != from);
+
+ return len;
+}
+
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
@@ -590,7 +626,7 @@ static struct page *rb_alloc_aux_page(int node, int order)
return page;
}
-static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+static void rb_free_aux_page(struct perf_buffer *rb, int idx)
{
struct page *page = virt_to_page(rb->aux_pages[idx]);
@@ -599,7 +635,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
-static void __rb_free_aux(struct ring_buffer *rb)
+static void __rb_free_aux(struct perf_buffer *rb)
{
int pg;
@@ -626,7 +662,7 @@ static void __rb_free_aux(struct ring_buffer *rb)
}
}
-int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
@@ -717,7 +753,7 @@ out:
return ret;
}
-void rb_free_aux(struct ring_buffer *rb)
+void rb_free_aux(struct perf_buffer *rb)
{
if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
@@ -730,7 +766,7 @@ void rb_free_aux(struct ring_buffer *rb)
*/
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
@@ -754,13 +790,21 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+static void perf_mmap_free_page(void *addr)
{
- struct ring_buffer *rb;
+ struct page *page = virt_to_page(addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct perf_buffer *rb;
unsigned long size;
int i;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
@@ -788,9 +832,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
fail_data_pages:
for (i--; i >= 0; i--)
- free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
- free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
@@ -799,32 +843,24 @@ fail:
return NULL;
}
-static void perf_mmap_free_page(unsigned long addr)
-{
- struct page *page = virt_to_page((void *)addr);
-
- page->mapping = NULL;
- __free_page(page);
-}
-
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
int i;
- perf_mmap_free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
- perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}
#else
-static int data_page_nr(struct ring_buffer *rb)
+static int data_page_nr(struct perf_buffer *rb)
{
return rb->nr_pages << page_order(rb);
}
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
@@ -842,11 +878,11 @@ static void perf_mmap_unmark_page(void *addr)
static void rb_free_work(struct work_struct *work)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
void *base;
int i, nr;
- rb = container_of(work, struct ring_buffer, work);
+ rb = container_of(work, struct perf_buffer, work);
nr = data_page_nr(rb);
base = rb->user_page;
@@ -858,18 +894,18 @@ static void rb_free_work(struct work_struct *work)
kfree(rb);
}
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
schedule_work(&rb->work);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long size;
void *all_buf;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += sizeof(void *);
rb = kzalloc(size, GFP_KERNEL);
@@ -903,7 +939,7 @@ fail:
#endif
struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (rb->aux_nr_pages) {
/* above AUX space */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa00497c49..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
#include <linux/uprobes.h>
@@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
+ * @old_page: the page we are replacing by new_page
+ * @new_page: the modified page we replace page by
*
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
@@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (new_page) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
@@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (new_page)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (new_page)
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
@@ -464,14 +473,18 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
+ bool orig_page_huge = false;
+ unsigned int gup_flags = FOLL_FORCE;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
retry:
+ if (is_register)
+ gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+ &old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -479,6 +492,12 @@ retry:
if (ret <= 0)
goto put_old;
+ if (WARN(!is_register && PageCompound(old_page),
+ "uprobe unregister should never work on compound page\n")) {
+ ret = -EINVAL;
+ goto put_old;
+ }
+
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
@@ -488,6 +507,10 @@ retry:
ref_ctr_updated = 1;
}
+ ret = 0;
+ if (!is_register && !PageAnon(old_page))
+ goto put_old;
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
@@ -501,8 +524,33 @@ retry:
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (PageUptodate(orig_page) &&
+ pages_identical(new_page, orig_page)) {
+ /* let go new_page */
+ put_page(new_page);
+ new_page = NULL;
+
+ if (PageCompound(orig_page))
+ orig_page_huge = true;
+ }
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
- put_page(new_page);
+ if (new_page)
+ put_page(new_page);
put_old:
put_page(old_page);
@@ -513,6 +561,10 @@ put_old:
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
+ /* try collapse pmd for compound page */
+ if (!ret && orig_page_huge)
+ collapse_pte_mapped_thp(mm, vaddr);
+
return ret;
}
@@ -1405,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
+ if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b4a5dcce8f8..2833ffb0c211 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+ if (refcount_dec_and_test(&task->rcu_users))
+ call_rcu(&task->rcu, delayed_put_task_struct);
+}
void release_task(struct task_struct *p)
{
@@ -222,76 +227,13 @@ repeat:
write_unlock_irq(&tasklist_lock);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ put_task_struct_rcu_user(p);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
- struct sighand_struct *sighand;
- struct task_struct *task;
-
- /*
- * We need to verify that release_task() was not called and thus
- * delayed_put_task_struct() can't run and drop the last reference
- * before rcu_read_unlock(). We check task->sighand != NULL,
- * but we can read the already freed and reused memory.
- */
-retry:
- task = rcu_dereference(*ptask);
- if (!task)
- return NULL;
-
- probe_kernel_address(&task->sighand, sighand);
-
- /*
- * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
- * was already freed we can not miss the preceding update of this
- * pointer.
- */
- smp_rmb();
- if (unlikely(task != READ_ONCE(*ptask)))
- goto retry;
-
- /*
- * We've re-checked that "task == *ptask", now we have two different
- * cases:
- *
- * 1. This is actually the same task/task_struct. In this case
- * sighand != NULL tells us it is still alive.
- *
- * 2. This is another task which got the same memory for task_struct.
- * We can't know this of course, and we can not trust
- * sighand != NULL.
- *
- * In this case we actually return a random value, but this is
- * correct.
- *
- * If we return NULL - we can pretend that we actually noticed that
- * *ptask was updated when the previous task has exited. Or pretend
- * that probe_slab_address(&sighand) reads NULL.
- *
- * If we return the new task (because sighand is not NULL for any
- * reason) - this is fine too. This (new) task can't go away before
- * another gp pass.
- *
- * And note: We could even eliminate the false positive if re-read
- * task->sighand once again to avoid the falsely NULL. But this case
- * is very unlikely so we don't care.
- */
- if (!sighand)
- return NULL;
-
- return task;
-}
-
void rcuwait_wake_up(struct rcuwait *w)
{
struct task_struct *task;
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)
*/
smp_mb(); /* (B) */
- /*
- * Avoid using task_rcu_dereference() magic as long as we are careful,
- * see comment in rcuwait_wait_event() regarding ->exit_state.
- */
task = rcu_dereference(w->task);
if (task)
wake_up_process(task);
@@ -499,7 +437,7 @@ static void exit_mm(void)
struct mm_struct *mm = current->mm;
struct core_state *core_state;
- mm_release(current, mm);
+ exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
@@ -579,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
- }
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -808,32 +742,12 @@ void __noreturn do_exit(long code)
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
- /*
- * We can do this unlocked here. The futex code uses
- * this flag just to verify whether the pi state
- * cleanup has been done or not. In the worst case it
- * loops once more. We pretend that the cleanup was
- * done as there is no way to return. Either the
- * OWNER_DIED bit is set by now or we push the blocked
- * task into the wait for ever nirwana as well.
- */
- tsk->flags |= PF_EXITPIDONE;
+ futex_exit_recursive(tsk);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
exit_signals(tsk); /* sets PF_EXITING */
- /*
- * Ensure that all new tsk->pi_lock acquisitions must observe
- * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
- */
- smp_mb();
- /*
- * Ensure that we must observe the pi_state in exit_mm() ->
- * mm_release() -> exit_pi_state_list().
- */
- raw_spin_lock_irq(&tsk->pi_lock);
- raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -848,6 +762,14 @@ void __noreturn do_exit(long code)
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a useable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
@@ -908,12 +830,6 @@ void __noreturn do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
- /*
- * We can do this unlocked here. The futex code uses this flag
- * just to verify whether the pi state cleanup has been done
- * or not. In the worst case it loops once more.
- */
- tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context(tsk);
@@ -1497,7 +1413,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
- TASK_INTERRUPTIBLE, 1, p);
+ TASK_INTERRUPTIBLE, p);
}
static long do_wait(struct wait_opts *wo)
@@ -1519,7 +1435,7 @@ repeat:
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
- (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
@@ -1554,6 +1470,23 @@ end:
return retval;
}
+static struct pid *pidfd_get_pid(unsigned int fd)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid))
+ get_pid(pid);
+
+ fdput(f);
+ return pid;
+}
+
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
@@ -1576,19 +1509,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
+
+ pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
- if (upid <= 0)
+ if (upid < 0)
return -EINVAL;
+
+ if (upid)
+ pid = find_get_pid(upid);
+ else
+ pid = get_task_pid(current, PIDTYPE_PGID);
+ break;
+ case P_PIDFD:
+ type = PIDTYPE_PID;
+ if (upid < 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(upid);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
- if (type < PIDTYPE_MAX)
- pid = find_get_pid(upid);
-
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..a0024f27d3a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,15 +40,24 @@ void __init sort_main_extable(void)
}
}
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+ return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table,
- __stop___ex_table - __start___ex_table, addr);
+ e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
+ if (!e)
+ e = search_bpf_extables(addr);
return e;
}
@@ -122,8 +131,9 @@ int kernel_text_address(unsigned long addr)
* triggers a stack trace, or a WARN() that happens during
* coming back from idle, or cpu on or offlining.
*
- * is_module_text_address() as well as the kprobe slots
- * and is_bpf_text_address() require RCU to be watching.
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address require
+ * RCU to be watching.
*/
no_rcu = !rcu_is_watching();
@@ -139,6 +149,8 @@ int kernel_text_address(unsigned long addr)
goto out;
if (is_bpf_text_address(addr))
goto out;
+ if (is_bpf_image_address(addr))
+ goto out;
ret = 0;
out:
if (no_rcu)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..60a1295f4384 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,7 +40,6 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
-#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -94,6 +93,7 @@
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
+#include <linux/kasan.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */
static int max_threads; /* tunable limit on nr_threads */
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
+
+static const char * const resident_page_types[] = {
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
+
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
@@ -215,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+ /* Clear the KASAN shadow of the stack. */
+ kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
@@ -645,12 +657,15 @@ static void check_mm(struct mm_struct *mm)
{
int i;
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
+
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = atomic_long_read(&mm->rss_stat.count[i]);
if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
+ mm, resident_page_types[i], x);
}
if (mm_pgtables_bytes(mm))
@@ -677,7 +692,7 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
- mmu_notifier_mm_destroy(mm);
+ mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
free_mm(mm);
@@ -768,6 +783,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -782,6 +798,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
@@ -901,10 +918,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->cpus_ptr = &tsk->cpus_mask;
/*
- * One for us, one for whoever does the "release_task()" (usually
- * parent)
+ * One for the user space visible state that goes away when reaped.
+ * One for the scheduler.
*/
- refcount_set(&tsk->usage, 2);
+ refcount_set(&tsk->rcu_users, 2);
+ /* One for the rcu users */
+ refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -1006,8 +1025,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
- mmu_notifier_mm_init(mm);
- hmm_mm_init(mm);
+ mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -1268,24 +1286,8 @@ static int wait_for_vfork_done(struct task_struct *child,
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- /* Get rid of any futexes when releasing the mm */
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-#endif
-
uprobe_free_utask(tsk);
/* Get rid of any cached register state */
@@ -1318,6 +1320,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
+void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exit_release(tsk);
+ mm_release(tsk, mm);
+}
+
+void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exec_release(tsk);
+ mm_release(tsk, mm);
+}
+
/**
* dup_mm() - duplicates an existing mm structure
* @tsk: the task_struct with which the new mm will be associated.
@@ -1502,6 +1516,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
+
+ /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
+ if (clone_flags & CLONE_CLEAR_SIGHAND)
+ flush_signal_handlers(tsk, 0);
+
return 0;
}
@@ -1517,28 +1536,17 @@ void __cleanup_sighand(struct sighand_struct *sighand)
}
}
-#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a thread group.
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
+ struct posix_cputimers *pct = &sig->posix_cputimers;
unsigned long cpu_limit;
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
- if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
- sig->cputimer.running = true;
- }
-
- /* The timer lists. */
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ posix_cputimers_group_init(pct, cpu_limit);
}
-#else
-static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
-#endif
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
@@ -1640,23 +1648,6 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
-#ifdef CONFIG_POSIX_TIMERS
-/*
- * Initialize POSIX timer handling for a single task.
- */
-static void posix_cpu_timers_init(struct task_struct *tsk)
-{
- tsk->cputime_expires.prof_exp = 0;
- tsk->cputime_expires.virt_exp = 0;
- tsk->cputime_expires.sched_exp = 0;
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
-}
-#else
-static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
-#endif
-
static inline void init_task_pid_links(struct task_struct *task)
{
enum pid_type type;
@@ -1690,6 +1681,14 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op == &pidfd_fops)
+ return file->private_data;
+
+ return ERR_PTR(-EBADF);
+}
+
static int pidfd_release(struct inode *inode, struct file *file)
{
struct pid *pid = file->private_data;
@@ -1700,12 +1699,68 @@ static int pidfd_release(struct inode *inode, struct file *file)
}
#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestoral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ * namespace (also take care to create new mount namespaces in the
+ * new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ * have exactly one entry, which is 0
+ */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
- struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
struct pid *pid = f->private_data;
+ struct pid_namespace *ns;
+ pid_t nr = -1;
- seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+ ns = proc_pid_ns(file_inode(m->file));
+ nr = pid_nr_ns(pid, ns);
+ }
+
+ seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+ if (nr > 0) {
+ int i;
+
+ /* If nr is non-zero it means that 'pid' is valid and that
+ * ns, i.e. the pid namespace associated with the procfs
+ * instance, is in the pid namespace hierarchy of pid.
+ * Start at one below the already printed level.
+ */
+ for (i = ns->level + 1; i <= pid->level; i++)
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ }
+#endif
seq_putc(m, '\n');
}
#endif
@@ -1713,11 +1768,11 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
/*
* Poll support for process exit notification.
*/
-static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct task_struct *task;
struct pid *pid = file->private_data;
- int poll_flags = 0;
+ __poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
@@ -1729,7 +1784,7 @@ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
* group, then poll(2) should block, similar to the wait(2) family.
*/
if (!task || (task->exit_state && thread_group_empty(task)))
- poll_flags = POLLIN | POLLRDNORM;
+ poll_flags = EPOLLIN | EPOLLRDNORM;
rcu_read_unlock();
return poll_flags;
@@ -1777,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process(
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
u64 clone_flags = args->flags;
+ struct nsproxy *nsp = current->nsproxy;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1819,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process(
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
- (task_active_pid_ns(current) !=
- current->nsproxy->pid_ns_for_children))
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * If the new process will be in a different time namespace
+ * do not allow it to share VM or a thread group with the forking task.
+ */
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
@@ -1935,7 +1999,7 @@ static __latent_entropy struct task_struct *copy_process(
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
- posix_cpu_timers_init(p);
+ posix_cputimers_init(&p->posix_cputimers);
p->io_context = NULL;
audit_set_context(p, NULL);
@@ -2031,7 +2095,8 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);
if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
+ args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
@@ -2067,14 +2132,8 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
-#ifdef CONFIG_FUTEX
- p->robust_list = NULL;
-#ifdef CONFIG_COMPAT
- p->compat_robust_list = NULL;
-#endif
- INIT_LIST_HEAD(&p->pi_state_list);
- p->pi_state_cache = NULL;
-#endif
+ futex_init_task(p);
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -2135,7 +2194,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boottime_ns();
+ p->start_boottime = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -2338,6 +2397,8 @@ struct mm_struct *copy_init_mm(void)
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
*/
long _do_fork(struct kernel_clone_args *args)
{
@@ -2526,41 +2587,49 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
+
+/*
+ * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
+ * the registers containing the syscall arguments for clone. This doesn't work
+ * with clone3 since the TLS value is passed in clone_args instead.
+ */
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+#error clone3 requires copy_thread_tls support in arch
+#endif
+
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
- size_t size)
+ size_t usize)
{
+ int err;
struct clone_args args;
+ pid_t *kset_tid = kargs->set_tid;
- if (unlikely(size > PAGE_SIZE))
+ if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
-
- if (unlikely(size < sizeof(struct clone_args)))
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
return -EINVAL;
- if (unlikely(!access_ok(uargs, size)))
- return -EFAULT;
-
- if (size > sizeof(struct clone_args)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (err)
+ return err;
- addr = (void __user *)uargs + sizeof(struct clone_args);
- end = (void __user *)uargs + size;
+ if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+ return -EINVAL;
- for (; addr < end; addr++) {
- if (get_user(val, addr))
- return -EFAULT;
- if (val)
- return -E2BIG;
- }
+ if (unlikely(!args.set_tid && args.set_tid_size > 0))
+ return -EINVAL;
- size = sizeof(struct clone_args);
- }
+ if (unlikely(args.set_tid && args.set_tid_size == 0))
+ return -EINVAL;
- if (copy_from_user(&args, uargs, size))
- return -EFAULT;
+ /*
+ * Verify that higher 32bits of exit_signal are unset and that
+ * it is a valid signal
+ */
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ !valid_signal(args.exit_signal)))
+ return -EINVAL;
*kargs = (struct kernel_clone_args){
.flags = args.flags,
@@ -2571,18 +2640,51 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
+ .set_tid_size = args.set_tid_size,
};
+ if (args.set_tid &&
+ copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+ (kargs->set_tid_size * sizeof(pid_t))))
+ return -EFAULT;
+
+ kargs->set_tid = kset_tid;
+
return 0;
}
-static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+/**
+ * clone3_stack_valid - check and prepare stack
+ * @kargs: kernel clone args
+ *
+ * Verify that the stack arguments userspace gave us are sane.
+ * In addition, set the stack direction for userspace since it's easy for us to
+ * determine.
+ */
+static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
- /*
- * All lower bits of the flag word are taken.
- * Verify that no other unknown flags are passed along.
- */
- if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ if (kargs->stack == 0) {
+ if (kargs->stack_size > 0)
+ return false;
+ } else {
+ if (kargs->stack_size == 0)
+ return false;
+
+ if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
+ return false;
+
+#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+ kargs->stack += kargs->stack_size;
+#endif
+ }
+
+ return true;
+}
+
+static bool clone3_args_valid(struct kernel_clone_args *kargs)
+{
+ /* Verify that no unknown flags are passed along. */
+ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
return false;
/*
@@ -2592,18 +2694,39 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
return false;
+ if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
+ (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
+ return false;
+
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
kargs->exit_signal)
return false;
+ if (!clone3_stack_valid(kargs))
+ return false;
+
return true;
}
+/**
+ * clone3 - create a new process with specific properties
+ * @uargs: argument structure
+ * @size: size of @uargs
+ *
+ * clone3() is the extensible successor to clone()/clone2().
+ * It takes a struct as argument that is versioned by its size.
+ *
+ * Return: On success, a positive PID for the child process.
+ * On error, a negative errno number.
+ */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
int err;
struct kernel_clone_args kargs;
+ pid_t set_tid[MAX_PID_NS_LEVEL];
+
+ kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
if (err)
@@ -2707,7 +2830,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWTIME))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
@@ -2929,7 +3053,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
struct ctl_table t;
int ret;
int threads = max_threads;
- int min = MIN_THREADS;
+ int min = 1;
int max = MAX_THREADS;
t = *table;
@@ -2941,7 +3065,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- set_max_threads(threads);
+ max_threads = threads;
return 0;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c0738424bb43..dc520f01f99d 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt);
bool pm_freezing;
bool pm_nosig_freezing;
-/*
- * Temporary export for the deadlock workaround in ata_scsi_hotplug().
- * Remove once the hack becomes unnecessary.
- */
-EXPORT_SYMBOL_GPL(pm_freezing);
-
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..0cf84c8664f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -325,6 +325,12 @@ static inline bool should_fail_futex(bool fshared)
}
#endif /* CONFIG_FAIL_FUTEX */
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
static inline void futex_get_mm(union futex_key *key)
{
mmgrab(key->private.mm);
@@ -487,11 +493,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
if (!time)
return NULL;
- hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(timeout, current);
-
+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
/*
* If range_ns is 0, calling hrtimer_set_expires_range_ns() is
* effectively the same as calling hrtimer_set_expires().
@@ -892,7 +896,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
@@ -962,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr)
}
raw_spin_unlock_irq(&curr->pi_lock);
}
-
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
/*
@@ -1171,16 +1176,48 @@ out_error:
return ret;
}
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
static int handle_exit_race(u32 __user *uaddr, u32 uval,
struct task_struct *tsk)
{
u32 uval2;
/*
- * If PF_EXITPIDONE is not yet set, then try again.
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
*/
- if (tsk && !(tsk->flags & PF_EXITPIDONE))
- return -EAGAIN;
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
/*
* Reread the user space value to handle the following situation:
@@ -1198,8 +1235,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* *uaddr = 0xC0000000; tsk = get_task(PID);
* } if (!tsk->flags & PF_EXITING) {
* ... attach();
- * tsk->flags |= PF_EXITPIDONE; } else {
- * if (!(tsk->flags & PF_EXITPIDONE))
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
* return -EAGAIN;
* return -ESRCH; <--- FAIL
* }
@@ -1230,7 +1268,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* it after doing proper sanity checks.
*/
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
@@ -1255,22 +1294,33 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
}
/*
- * We need to look at the task state flags to figure out,
- * whether the task is exiting. To protect against the do_exit
- * change of the task flags, we do this protected by
- * p->pi_lock:
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->flags & PF_EXITING)) {
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
/*
- * The task is on the way out. When PF_EXITPIDONE is
- * set, we know that the task has finished the
- * cleanup:
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
*/
int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
return ret;
}
@@ -1309,7 +1359,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
static int lookup_pi_state(u32 __user *uaddr, u32 uval,
struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1324,7 +1375,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uaddr, uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1352,6 +1403,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* lookup
* @task: the task to perform the atomic lock work for. This will
* be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Return:
@@ -1360,11 +1413,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* - <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
*/
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
struct futex_q *top_waiter;
@@ -1434,7 +1493,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uaddr, newval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
/**
@@ -1482,7 +1541,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
/*
* Queue the task for later wakeup for after we've released
- * the hb->lock. wake_q_add() grabs reference to p.
+ * the hb->lock.
*/
wake_q_add_safe(wake_q, p);
}
@@ -1852,6 +1911,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* @key1: the from futex key
* @key2: the to futex key
* @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1859,16 +1920,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
* Return:
* - 0 - failed to acquire the lock atomically;
* - >0 - acquired the lock, return value is vpid of the top_waiter
* - <0 - error
*/
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
- struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2,
- union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
{
struct futex_q *top_waiter = NULL;
u32 curval;
@@ -1905,7 +1970,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
*/
vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- set_waiters);
+ exiting, set_waiters);
if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
@@ -2034,6 +2099,8 @@ retry_private:
}
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ struct task_struct *exiting = NULL;
+
/*
* Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -2041,7 +2108,8 @@ retry_private:
* faults rather in the requeue loop below.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state, nr_requeue);
+ &key2, &pi_state,
+ &exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or is
@@ -2068,7 +2136,8 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+ &pi_state, &exiting);
}
switch (ret) {
@@ -2086,17 +2155,24 @@ retry_private:
if (!ret)
goto retry;
goto out;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Owner is exiting and we just wait for the
+ * - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -2613,7 +2689,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
/* Arm the timer */
if (timeout)
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
/*
* If we have been removed from the hash list, then another task
@@ -2803,6 +2879,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
+ struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
@@ -2824,7 +2901,8 @@ retry:
retry_private:
hb = queue_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
if (unlikely(ret)) {
/*
* Atomic work succeeded and we got the lock,
@@ -2837,15 +2915,22 @@ retry_private:
goto out_unlock_put_key;
case -EFAULT:
goto uaddr_faulted;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Task is exiting and we just wait for the
+ * - EBUSY: Task is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -2899,7 +2984,7 @@ retry_private:
}
if (unlikely(to))
- hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
@@ -3454,11 +3539,16 @@ err_unlock:
return ret;
}
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
/*
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
{
u32 uval, uninitialized_var(nval), mval;
int err;
@@ -3471,6 +3561,42 @@ retry:
if (get_user(uval, uaddr))
return -1;
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) User space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. The user space futex value is
+ * uncontended and the rest of the user space mutex state is
+ * consistent, so a woken waiter will just take over the
+ * uncontended futex. Setting the OWNER_DIED bit would create
+ * inconsistent state and malfunction of the user space owner died
+ * handling.
+ */
+ if (pending_op && !pi && !uval) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
return 0;
@@ -3549,7 +3675,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
*
* We silently return on any sign of list-walking problem.
*/
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3590,10 +3716,11 @@ void exit_robust_list(struct task_struct *curr)
* A pending lock might already be on the list, so
* don't process it twice:
*/
- if (entry != pending)
+ if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi))
+ curr, pi, HANDLE_DEATH_LIST))
return;
+ }
if (rc)
return;
entry = next_entry;
@@ -3607,9 +3734,118 @@ void exit_robust_list(struct task_struct *curr)
cond_resched();
}
- if (pending)
+ if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip);
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent futher damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3739,7 +3975,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
*
* We silently return on any sign of list-walking problem.
*/
-void compat_exit_robust_list(struct task_struct *curr)
+static void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3786,7 +4022,8 @@ void compat_exit_robust_list(struct task_struct *curr)
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi))
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
return;
}
if (rc)
@@ -3805,7 +4042,7 @@ void compat_exit_robust_list(struct task_struct *curr)
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip);
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 9ff449888d9c..e13ca842eb7e 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# This script generates an archive consisting of kernel headers
@@ -21,30 +21,38 @@ arch/$SRCARCH/include/
# Uncomment it for debugging.
# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
-# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter
-# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter
+# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter
+
+all_dirs=
+if [ "$building_out_of_srctree" ]; then
+ for d in $dir_list; do
+ all_dirs="$all_dirs $srctree/$d"
+ done
+fi
+all_dirs="$all_dirs $dir_list"
# include/generated/compile.h is ignored because it is touched even when none
-# of the source files changed. This causes pointless regeneration, so let us
-# ignore them for md5 calculation.
-pushd $srctree > /dev/null
-src_files_md5="$(find $dir_list -name "*.h" |
- grep -v "include/generated/compile.h" |
- grep -v "include/generated/autoconf.h" |
- xargs ls -l | md5sum | cut -d ' ' -f1)"
-popd > /dev/null
-obj_files_md5="$(find $dir_list -name "*.h" |
- grep -v "include/generated/compile.h" |
- grep -v "include/generated/autoconf.h" |
+# of the source files changed.
+#
+# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
+# updated, but the contents might be still the same. When any CONFIG option is
+# changed, Kconfig touches the corresponding timestamp file include/config/*.h.
+# Hence, the md5sum detects the configuration change anyway. We do not need to
+# check include/generated/autoconf.h explicitly.
+#
+# Ignore them for md5 calculation to avoid pointless regeneration.
+headers_md5="$(find $all_dirs -name "*.h" |
+ grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
xargs ls -l | md5sum | cut -d ' ' -f1)"
+
# Any changes to this script will also cause a rebuild of the archive.
this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
if [ -f kernel/kheaders.md5 ] &&
- [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
- [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
- [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] &&
- [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
+ [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] &&
+ [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] &&
+ [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then
exit
fi
@@ -55,14 +63,17 @@ fi
rm -rf $cpio_dir
mkdir $cpio_dir
-pushd $srctree > /dev/null
-for f in $dir_list;
- do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir
-popd > /dev/null
+if [ "$building_out_of_srctree" ]; then
+ (
+ cd $srctree
+ for f in $dir_list
+ do find "$f" -name "*.h";
+ done | cpio --quiet -pd $cpio_dir
+ )
+fi
-# The second CPIO can complain if files already exist which can
-# happen with out of tree builds. Just silence CPIO for now.
+# The second CPIO can complain if files already exist which can happen with out
+# of tree builds having stale headers in srctree. Just silence CPIO for now.
for f in $dir_list;
do find "$f" -name "*.h";
done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
@@ -71,10 +82,15 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
find $cpio_dir -type f -print0 |
xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
-tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
+# Create archive and try to normalize metadata for reproducibility.
+# For compatibility with older versions of tar, files are fed to tar
+# pre-sorted, as --sort=name might not be available.
+find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \
+ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+ --owner=0 --group=0 --numeric-owner --no-recursion \
+ -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null
-echo "$src_files_md5" > kernel/kheaders.md5
-echo "$obj_files_md5" >> kernel/kheaders.md5
+echo $headers_md5 > kernel/kheaders.md5
echo "$this_file_md5" >> kernel/kheaders.md5
echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 6fef48033f96..4d89ad4fae3b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/sort.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
+struct node_vectors {
+ unsigned id;
+
+ union {
+ unsigned nvectors;
+ unsigned ncpus;
+ };
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+ const struct node_vectors *ln = l;
+ const struct node_vectors *rn = r;
+
+ return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated numbver is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_vectors *node_vectors)
+{
+ unsigned n, remaining_ncpus = 0;
+
+ for (n = 0; n < nr_node_ids; n++) {
+ node_vectors[n].id = n;
+ node_vectors[n].ncpus = UINT_MAX;
+ }
+
+ for_each_node_mask(n, nodemsk) {
+ unsigned ncpus;
+
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ ncpus = cpumask_weight(nmsk);
+
+ if (!ncpus)
+ continue;
+ remaining_ncpus += ncpus;
+ node_vectors[n].ncpus = ncpus;
+ }
+
+ numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+ sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+ ncpus_cmp_func, NULL);
+
+ /*
+ * Allocate vectors for each node according to the ratio of this
+ * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+ * bigger than number of active numa nodes. Always start the
+ * allocation from the node with minimized nr_cpus.
+ *
+ * This way guarantees that each active node gets allocated at
+ * least one vector, and the theory is simple: over-allocation
+ * is only done when this node is assigned by one vector, so
+ * other nodes will be allocated >= 1 vector, since 'numvecs' is
+ * bigger than number of numa nodes.
+ *
+ * One perfect invariant is that number of allocated vectors for
+ * each node is <= CPU count of this node:
+ *
+ * 1) suppose there are two nodes: A and B
+ * ncpu(X) is CPU count of node X
+ * vecs(X) is the vector count allocated to node X via this
+ * algorithm
+ *
+ * ncpu(A) <= ncpu(B)
+ * ncpu(A) + ncpu(B) = N
+ * vecs(A) + vecs(B) = V
+ *
+ * vecs(A) = max(1, round_down(V * ncpu(A) / N))
+ * vecs(B) = V - vecs(A)
+ *
+ * both N and V are integer, and 2 <= V <= N, suppose
+ * V = N - delta, and 0 <= delta <= N - 2
+ *
+ * 2) obviously vecs(A) <= ncpu(A) because:
+ *
+ * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+ * ncpu(A) >= 1
+ *
+ * otherwise,
+ * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+ *
+ * 3) prove how vecs(B) <= ncpu(B):
+ *
+ * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+ * over-allocated, so vecs(B) <= ncpu(B),
+ *
+ * otherwise:
+ *
+ * vecs(A) =
+ * round_down(V * ncpu(A) / N) =
+ * round_down((N - delta) * ncpu(A) / N) =
+ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+ * round_down((N * ncpu(A) - delta * N) / N) =
+ * cpu(A) - delta
+ *
+ * then:
+ *
+ * vecs(A) - V >= ncpu(A) - delta - V
+ * =>
+ * V - vecs(A) <= V + delta - ncpu(A)
+ * =>
+ * vecs(B) <= N - ncpu(A)
+ * =>
+ * vecs(B) <= cpu(B)
+ *
+ * For nodes >= 3, it can be thought as one node and another big
+ * node given that is exactly what this algorithm is implemented,
+ * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+ * finally for each node X: vecs(X) <= ncpu(X).
+ *
+ */
+ for (n = 0; n < nr_node_ids; n++) {
+ unsigned nvectors, ncpus;
+
+ if (node_vectors[n].ncpus == UINT_MAX)
+ continue;
+
+ WARN_ON_ONCE(numvecs == 0);
+
+ ncpus = node_vectors[n].ncpus;
+ nvectors = max_t(unsigned, 1,
+ numvecs * ncpus / remaining_ncpus);
+ WARN_ON_ONCE(nvectors > ncpus);
+
+ node_vectors[n].nvectors = nvectors;
+
+ remaining_ncpus -= ncpus;
+ numvecs -= nvectors;
+ }
+}
+
static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = firstvec + numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
+ struct node_vectors *node_vectors;
if (!cpumask_weight(cpu_mask))
return 0;
@@ -126,42 +277,56 @@ static int __irq_build_affinity_masks(unsigned int startvec,
return numvecs;
}
- for_each_node_mask(n, nodemsk) {
- unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+ node_vectors = kcalloc(nr_node_ids,
+ sizeof(struct node_vectors),
+ GFP_KERNEL);
+ if (!node_vectors)
+ return -ENOMEM;
- /* Spread the vectors per node */
- vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
+ /* allocate vector number for each node */
+ alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+ nodemsk, nmsk, node_vectors);
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ for (i = 0; i < nr_node_ids; i++) {
+ unsigned int ncpus, v;
+ struct node_vectors *nv = &node_vectors[i];
+
+ if (nv->nvectors == UINT_MAX)
+ continue;
- /* Calculate the number of cpus per vector */
+ /* Get the cpus on this node which are in the mask */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
ncpus = cpumask_weight(nmsk);
- vecs_to_assign = min(vecs_per_node, ncpus);
+ if (!ncpus)
+ continue;
+
+ WARN_ON_ONCE(nv->nvectors > ncpus);
/* Account for rounding errors */
- extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+ extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
- for (v = 0; curvec < last_affv && v < vecs_to_assign;
- curvec++, v++) {
- cpus_per_vec = ncpus / vecs_to_assign;
+ /* Spread allocated vectors on CPUs of the current node */
+ for (v = 0; v < nv->nvectors; v++, curvec++) {
+ cpus_per_vec = ncpus / nv->nvectors;
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
--extra_vecs;
}
+
+ /*
+ * wrapping has to be considered given 'startvec'
+ * may start anywhere
+ */
+ if (curvec >= last_affv)
+ curvec = firstvec;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
-
- done += v;
- if (done >= numvecs)
- break;
- if (curvec >= last_affv)
- curvec = firstvec;
- --nodes;
+ done += nv->nvectors;
}
+ kfree(node_vectors);
return done;
}
@@ -174,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- unsigned int curvec = startvec, nr_present, nr_others;
+ unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -189,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
if (!node_to_cpumask)
goto fail_npresmsk;
- ret = 0;
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- cpu_present_mask, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, cpu_present_mask,
+ nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
/*
* Spread on non present CPUs starting from the next vector to be
@@ -210,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- npresmsk, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, npresmsk, nmsk,
+ masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+ fail_build_affinity:
put_online_cpus();
- if (nr_present < numvecs)
+ if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);
free_node_to_cpumask(node_to_cpumask);
@@ -225,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
fail_nmsk:
free_cpumask_var(nmsk);
- return ret;
+ return ret < 0 ? ret : 0;
}
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b76703b2c0af..b3fa2d87d2f3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1298,6 +1298,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
/**
+ * irq_chip_set_parent_state - set the state of a parent interrupt.
+ *
+ * @data: Pointer to interrupt specific data
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * Conditional success, if the underlying irqchip does not implement it.
+ */
+int irq_chip_set_parent_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool val)
+{
+ data = data->parent_data;
+
+ if (!data || !data->chip->irq_set_irqchip_state)
+ return 0;
+
+ return data->chip->irq_set_irqchip_state(data, which, val);
+}
+EXPORT_SYMBOL_GPL(irq_chip_set_parent_state);
+
+/**
+ * irq_chip_get_parent_state - get the state of a parent interrupt.
+ *
+ * @data: Pointer to interrupt specific data
+ * @which: one of IRQCHIP_STATE_* the caller wants to know
+ * @state: a pointer to a boolean where the state is to be stored
+ *
+ * Conditional success, if the underlying irqchip does not implement it.
+ */
+int irq_chip_get_parent_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool *state)
+{
+ data = data->parent_data;
+
+ if (!data || !data->chip->irq_get_irqchip_state)
+ return 0;
+
+ return data->chip->irq_get_irqchip_state(data, which, state);
+}
+EXPORT_SYMBOL_GPL(irq_chip_get_parent_state);
+
+/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
* @data: Pointer to interrupt specific data
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c7ca2e983a5..02236b13b359 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -12,6 +12,7 @@
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/irq.h>
+#include <linux/sched/isolation.h>
#include "internals.h"
@@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void)
}
}
+static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
+{
+ const struct cpumask *hk_mask;
+
+ if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ return false;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
+ return false;
+
+ return cpumask_test_cpu(cpu, hk_mask);
+}
+
static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
@@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
/*
* If the interrupt can only be directed to a single target
* CPU then it is already assigned to a CPU in the affinity
- * mask. No point in trying to move it around.
+ * mask. No point in trying to move it around unless the
+ * isolation mechanism requests to move it to an upcoming
+ * housekeeping CPU.
*/
- if (!irqd_is_single_target(data))
+ if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
irq_set_affinity_locked(data, affinity, false);
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c1eccd4f6520..a949bd39e343 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9484e88dabc2..98a5f10d1900 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
}
}
+static void irq_sysfs_del(struct irq_desc *desc)
+{
+ /*
+ * If irq_sysfs_init() has not yet been invoked (early boot), then
+ * irq_kobj_base is NULL and the descriptor was never added.
+ * kobject_del() complains about a object with no parent, so make
+ * it conditional.
+ */
+ if (irq_kobj_base)
+ kobject_del(&desc->kobj);
+}
+
static int __init irq_sysfs_init(void)
{
struct irq_desc *desc;
@@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = {
};
static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
@@ -438,7 +451,7 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- kobject_del(&desc->kobj);
+ irq_sysfs_del(desc);
delete_irq_desc(irq);
/*
@@ -737,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
EXPORT_SYMBOL_GPL(irq_free_descs);
/**
- * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * __irq_alloc_descs - allocate and initialize a range of irq descriptors
* @irq: Allocate for specific irq number if irq >= 0
* @from: Start the search from this irq number
* @cnt: Number of consecutive irqs to allocate.
@@ -878,6 +891,7 @@ __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
}
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+ __releases(&desc->lock)
{
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (bus)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3078d0e48bba..7527e5ef6fe5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -31,7 +31,7 @@ struct irqchip_fwid {
struct fwnode_handle fwnode;
unsigned int type;
char *name;
- void *data;
+ phys_addr_t *pa;
};
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
@@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* @type: Type of irqchip_fwnode. See linux/irqdomain.h
* @name: Optional user provided domain name
* @id: Optional user provided id if name != NULL
- * @data: Optional user-provided data
+ * @pa: Optional user-provided physical address
*
* Allocate a struct irqchip_fwid, and return a poiner to the embedded
* fwnode_handle (or NULL on failure).
@@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* domain struct.
*/
struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
- const char *name, void *data)
+ const char *name,
+ phys_addr_t *pa)
{
struct irqchip_fwid *fwid;
char *n;
@@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
break;
default:
- n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+ n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);
break;
}
@@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
- fwid->data = data;
+ fwid->pa = pa;
fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
@@ -148,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
+ domain->fwnode = fwnode;
domain->name = kstrdup(fwid->name, GFP_KERNEL);
if (!domain->name) {
kfree(domain);
@@ -985,6 +987,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
/**
+ * irq_domain_translate_onecell() - Generic translate for direct one cell
+ * bindings
+ */
+int irq_domain_translate_onecell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 1))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
+
+/**
* irq_domain_translate_twocell() - Generic translate for direct two cell
* bindings
*
@@ -1457,6 +1476,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (rv) {
/* Restore the original irq_data. */
*root_irq_data = *child_irq_data;
+ kfree(child_irq_data);
goto error;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..3089a60ea8f9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,12 +18,13 @@
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
#include "internals.h"
-#ifdef CONFIG_IRQ_FORCED_THREADING
+#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
@@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- ret = chip->irq_set_affinity(data, mask, force);
+ /*
+ * If this is a managed interrupt and housekeeping is enabled on
+ * it check whether the requested affinity mask intersects with
+ * a housekeeping CPU. If so, then remove the isolated CPUs from
+ * the mask and just keep the housekeeping CPU(s). This prevents
+ * the affinity setter from routing the interrupt to an isolated
+ * CPU to avoid that I/O submitted from a housekeeping CPU causes
+ * interrupts on an isolated one.
+ *
+ * If the masks do not intersect or include online CPU(s) then
+ * keep the requested mask. The isolated target CPUs are only
+ * receiving interrupts when the I/O operation was submitted
+ * directly from them.
+ *
+ * If all housekeeping CPUs in the affinity mask are offline, the
+ * interrupt will be migrated by the CPU hotplug code once a
+ * housekeeping CPU which belongs to the affinity mask comes
+ * online.
+ */
+ if (irqd_affinity_is_managed(data) &&
+ housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask, *prog_mask;
+
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+
+ raw_spin_lock(&tmp_mask_lock);
+ cpumask_and(&tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+ prog_mask = mask;
+ else
+ prog_mask = &tmp_mask;
+ ret = chip->irq_set_affinity(data, prog_mask, force);
+ raw_spin_unlock(&tmp_mask_lock);
+ } else {
+ ret = chip->irq_set_affinity(data, mask, force);
+ }
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -692,6 +731,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
*
* Wakeup mode lets this IRQ wake the system from sleep
* states like "suspend to RAM".
+ *
+ * Note: irq enable/disable state is completely orthogonal
+ * to the enable/disable state of irq wake. An irq can be
+ * disabled with disable_irq() and still wake the system as
+ * long as the irq has wake enabled. If this does not hold,
+ * then the underlying irq chip and the related driver need
+ * to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
@@ -1255,8 +1301,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* the thread dies to avoid that the interrupt code
* references an already freed task_struct.
*/
- get_task_struct(t);
- new->thread = t;
+ new->thread = get_task_struct(t);
/*
* Tell the thread to set its affinity. This is
* important for shared interrupt handlers as we do
@@ -1501,8 +1546,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
- pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
- irq);
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
+ new->name, irq);
ret = -EINVAL;
goto out_unlock;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ad26fbcfbfc8..eb95f6106a1e 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d6961d3c6f9e..8f557fa1f4fe 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -177,6 +177,26 @@ static void resume_irqs(bool want_early)
}
/**
+ * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup
+ * @irq: Interrupt to rearm
+ */
+void rearm_wake_irq(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
+ !irqd_is_wakeup_set(&desc->irq_data))
+ return;
+
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+
+ irq_put_desc_busunlock(desc, flags);
+}
+
+/**
* irq_pm_syscore_ops - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..9e5783d98033 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
return 0;
}
-#ifndef is_affinity_mask_valid
-#define is_affinity_mask_valid(val) 1
-#endif
-
int no_irq_affinity;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
@@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (err)
goto free_cpumask;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto free_cpumask;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
@@ -185,20 +176,20 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
-static const struct file_operations irq_affinity_proc_fops = {
- .open = irq_affinity_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_proc_write,
+static const struct proc_ops irq_affinity_proc_ops = {
+ .proc_open = irq_affinity_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_proc_write,
};
-static const struct file_operations irq_affinity_list_proc_fops = {
- .open = irq_affinity_list_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_list_proc_write,
+static const struct proc_ops irq_affinity_list_proc_ops = {
+ .proc_open = irq_affinity_list_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_list_proc_write,
};
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file,
if (err)
goto out;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
@@ -260,12 +246,12 @@ static int default_affinity_open(struct inode *inode, struct file *file)
return single_open(file, default_affinity_show, PDE_DATA(inode));
}
-static const struct file_operations default_affinity_proc_fops = {
- .open = default_affinity_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = default_affinity_write,
+static const struct proc_ops default_affinity_proc_ops = {
+ .proc_open = default_affinity_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = default_affinity_write,
};
static int irq_node_proc_show(struct seq_file *m, void *v)
@@ -356,7 +342,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0644, desc->dir,
- &irq_affinity_proc_fops, irqp);
+ &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_single_data("affinity_hint", 0444, desc->dir,
@@ -364,7 +350,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", 0644, desc->dir,
- &irq_affinity_list_proc_fops, irqp);
+ &irq_affinity_list_proc_ops, irqp);
proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
irqp);
@@ -415,7 +401,7 @@ static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0644, NULL,
- &default_affinity_proc_fops);
+ &default_affinity_proc_ops);
#endif
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 95414ad3506a..98c04ca5fa43 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg)
irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
local_irq_disable();
desc->handle_irq(desc);
local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2ed97a7c9b2a..f865e5f4d382 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -34,6 +34,7 @@ static atomic_t irq_poll_active;
* true and let the handler run.
*/
bool irq_wait_for_poll(struct irq_desc *desc)
+ __must_hold(&desc->lock)
{
if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
"irq poll in progress on cpu %d for irq %d\n",
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index d42acaf81886..828cc30774bc 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, oflags, nflags;
+ int oflags;
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
- * Start with our best wish as a premise but only trust any
- * flag value after cmpxchg() result.
+ * If the work is already pending, no need to raise the IPI.
+ * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
+ * everything we did before is visible.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
- for (;;) {
- nflags = flags | IRQ_WORK_CLAIMED;
- oflags = cmpxchg(&work->flags, flags, nflags);
- if (oflags == flags)
- break;
- if (oflags & IRQ_WORK_PENDING)
- return false;
- flags = oflags;
- cpu_relax();
- }
-
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
return true;
}
@@ -61,7 +53,7 @@ void __weak arch_irq_work_raise(void)
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
+ if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
@@ -143,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
- unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -152,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
+ int flags;
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -159,15 +151,15 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
- xchg(&work->flags, flags);
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ flags &= ~IRQ_WORK_PENDING;
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
@@ -199,7 +191,7 @@ void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
- while (work->flags & IRQ_WORK_BUSY)
+ while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index df3008419a1d..cdb3ffab128b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
- WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ WARN_ONCE(!jump_entry_is_init(entry),
+ "can't patch jump_label at %pS",
+ (void *)jump_entry_code(entry));
return false;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 95a260f9214b..a9b3f660dee7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -44,7 +44,7 @@ __attribute__((weak, section(".rodata")));
extern const unsigned long kallsyms_relative_base
__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[] __weak;
+extern const char kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
extern const unsigned int kallsyms_markers[] __weak;
@@ -58,7 +58,8 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
char *result, size_t maxlen)
{
int len, skipped_first = 0;
- const u8 *tptr, *data;
+ const char *tptr;
+ const u8 *data;
/* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
@@ -263,8 +264,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
{
char namebuf[KSYM_NAME_LEN];
- if (is_ksym_addr(addr))
- return !!get_symbol_pos(addr, symbolsize, offset);
+ if (is_ksym_addr(addr)) {
+ get_symbol_pos(addr, symbolsize, offset);
+ return 1;
+ }
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
@@ -696,16 +699,16 @@ const char *kdb_walk_kallsyms(loff_t *pos)
}
#endif /* CONFIG_KGDB_KDB */
-static const struct file_operations kallsyms_operations = {
- .open = kallsyms_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static const struct proc_ops kallsyms_proc_ops = {
+ .proc_open = kallsyms_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_private,
};
static int __init kallsyms_init(void)
{
- proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
+ proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops);
return 0;
}
device_initcall(kallsyms_init);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 2ee38727844a..f50354202dbe 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/hashtable.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/preempt.h>
@@ -21,8 +22,11 @@
#include <linux/uaccess.h>
#include <linux/kcov.h>
#include <linux/refcount.h>
+#include <linux/log2.h>
#include <asm/setup.h>
+#define kcov_debug(fmt, ...) pr_debug("%s: " fmt, __func__, ##__VA_ARGS__)
+
/* Number of 64-bit words written per one comparison: */
#define KCOV_WORDS_PER_CMP 4
@@ -44,19 +48,100 @@ struct kcov {
* Reference counter. We keep one for:
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
+ * - each code section for remote coverage collection
*/
refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
- /* Size of arena (in long's for KCOV_MODE_TRACE). */
- unsigned size;
+ /* Size of arena (in long's). */
+ unsigned int size;
/* Coverage buffer shared with user space. */
void *area;
/* Task for which we collect coverage, or NULL. */
struct task_struct *t;
+ /* Collecting coverage from remote (background) threads. */
+ bool remote;
+ /* Size of remote area (in long's). */
+ unsigned int remote_size;
+ /*
+ * Sequence is incremented each time kcov is reenabled, used by
+ * kcov_remote_stop(), see the comment there.
+ */
+ int sequence;
};
+struct kcov_remote_area {
+ struct list_head list;
+ unsigned int size;
+};
+
+struct kcov_remote {
+ u64 handle;
+ struct kcov *kcov;
+ struct hlist_node hnode;
+};
+
+static DEFINE_SPINLOCK(kcov_remote_lock);
+static DEFINE_HASHTABLE(kcov_remote_map, 4);
+static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
+
+/* Must be called with kcov_remote_lock locked. */
+static struct kcov_remote *kcov_remote_find(u64 handle)
+{
+ struct kcov_remote *remote;
+
+ hash_for_each_possible(kcov_remote_map, remote, hnode, handle) {
+ if (remote->handle == handle)
+ return remote;
+ }
+ return NULL;
+}
+
+static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle)
+{
+ struct kcov_remote *remote;
+
+ if (kcov_remote_find(handle))
+ return ERR_PTR(-EEXIST);
+ remote = kmalloc(sizeof(*remote), GFP_ATOMIC);
+ if (!remote)
+ return ERR_PTR(-ENOMEM);
+ remote->handle = handle;
+ remote->kcov = kcov;
+ hash_add(kcov_remote_map, &remote->hnode, handle);
+ return remote;
+}
+
+/* Must be called with kcov_remote_lock locked. */
+static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
+{
+ struct kcov_remote_area *area;
+ struct list_head *pos;
+
+ kcov_debug("size = %u\n", size);
+ list_for_each(pos, &kcov_remote_areas) {
+ area = list_entry(pos, struct kcov_remote_area, list);
+ if (area->size == size) {
+ list_del(&area->list);
+ kcov_debug("rv = %px\n", area);
+ return area;
+ }
+ }
+ kcov_debug("rv = NULL\n");
+ return NULL;
+}
+
+/* Must be called with kcov_remote_lock locked. */
+static void kcov_remote_area_put(struct kcov_remote_area *area,
+ unsigned int size)
+{
+ kcov_debug("area = %px, size = %u\n", area, size);
+ INIT_LIST_HEAD(&area->list);
+ area->size = size;
+ list_add(&area->list, &kcov_remote_areas);
+}
+
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
unsigned int mode;
@@ -73,7 +158,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru
* in_interrupt() returns false (e.g. preempt_schedule_irq()).
* READ_ONCE()/barrier() effectively provides load-acquire wrt
* interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
+ * kcov_start().
*/
barrier();
return mode == needed_mode;
@@ -227,6 +312,78 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+static void kcov_start(struct task_struct *t, unsigned int size,
+ void *area, enum kcov_mode mode, int sequence)
+{
+ kcov_debug("t = %px, size = %u, area = %px\n", t, size, area);
+ /* Cache in task struct for performance. */
+ t->kcov_size = size;
+ t->kcov_area = area;
+ /* See comment in check_kcov_mode(). */
+ barrier();
+ WRITE_ONCE(t->kcov_mode, mode);
+ t->kcov_sequence = sequence;
+}
+
+static void kcov_stop(struct task_struct *t)
+{
+ WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
+ barrier();
+ t->kcov_size = 0;
+ t->kcov_area = NULL;
+}
+
+static void kcov_task_reset(struct task_struct *t)
+{
+ kcov_stop(t);
+ t->kcov = NULL;
+ t->kcov_sequence = 0;
+ t->kcov_handle = 0;
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ kcov_task_reset(t);
+ t->kcov_handle = current->kcov_handle;
+}
+
+static void kcov_reset(struct kcov *kcov)
+{
+ kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
+ kcov->remote = false;
+ kcov->remote_size = 0;
+ kcov->sequence++;
+}
+
+static void kcov_remote_reset(struct kcov *kcov)
+{
+ int bkt;
+ struct kcov_remote *remote;
+ struct hlist_node *tmp;
+
+ spin_lock(&kcov_remote_lock);
+ hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) {
+ if (remote->kcov != kcov)
+ continue;
+ kcov_debug("removing handle %llx\n", remote->handle);
+ hash_del(&remote->hnode);
+ kfree(remote);
+ }
+ /* Do reset before unlock to prevent races with kcov_remote_start(). */
+ kcov_reset(kcov);
+ spin_unlock(&kcov_remote_lock);
+}
+
+static void kcov_disable(struct task_struct *t, struct kcov *kcov)
+{
+ kcov_task_reset(t);
+ if (kcov->remote)
+ kcov_remote_reset(kcov);
+ else
+ kcov_reset(kcov);
+}
+
static void kcov_get(struct kcov *kcov)
{
refcount_inc(&kcov->refcount);
@@ -235,20 +392,12 @@ static void kcov_get(struct kcov *kcov)
static void kcov_put(struct kcov *kcov)
{
if (refcount_dec_and_test(&kcov->refcount)) {
+ kcov_remote_reset(kcov);
vfree(kcov->area);
kfree(kcov);
}
}
-void kcov_task_init(struct task_struct *t)
-{
- WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
- barrier();
- t->kcov_size = 0;
- t->kcov_area = NULL;
- t->kcov = NULL;
-}
-
void kcov_task_exit(struct task_struct *t)
{
struct kcov *kcov;
@@ -256,15 +405,36 @@ void kcov_task_exit(struct task_struct *t)
kcov = t->kcov;
if (kcov == NULL)
return;
+
spin_lock(&kcov->lock);
+ kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t);
+ /*
+ * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t,
+ * which comes down to:
+ * WARN_ON(!kcov->remote && kcov->t != t);
+ *
+ * For KCOV_REMOTE_ENABLE devices, the exiting task is either:
+ * 2. A remote task between kcov_remote_start() and kcov_remote_stop().
+ * In this case we should print a warning right away, since a task
+ * shouldn't be exiting when it's in a kcov coverage collection
+ * section. Here t points to the task that is collecting remote
+ * coverage, and t->kcov->t points to the thread that created the
+ * kcov device. Which means that to detect this case we need to
+ * check that t != t->kcov->t, and this gives us the following:
+ * WARN_ON(kcov->remote && kcov->t != t);
+ *
+ * 2. The task that created kcov exiting without calling KCOV_DISABLE,
+ * and then again we can make sure that t->kcov->t == t:
+ * WARN_ON(kcov->remote && kcov->t != t);
+ *
+ * By combining all three checks into one we get:
+ */
if (WARN_ON(kcov->t != t)) {
spin_unlock(&kcov->lock);
return;
}
/* Just to not leave dangling references behind. */
- kcov_task_init(t);
- kcov->t = NULL;
- kcov->mode = KCOV_MODE_INIT;
+ kcov_disable(t, kcov);
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -313,6 +483,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
+ kcov->sequence = 1;
refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -325,6 +496,20 @@ static int kcov_close(struct inode *inode, struct file *filep)
return 0;
}
+static int kcov_get_mode(unsigned long arg)
+{
+ if (arg == KCOV_TRACE_PC)
+ return KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ return KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
+}
+
/*
* Fault in a lazily-faulted vmalloc area before it can be used by
* __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
@@ -340,14 +525,35 @@ static void kcov_fault_in_area(struct kcov *kcov)
READ_ONCE(area[offset]);
}
+static inline bool kcov_check_handle(u64 handle, bool common_valid,
+ bool uncommon_valid, bool zero_valid)
+{
+ if (handle & ~(KCOV_SUBSYSTEM_MASK | KCOV_INSTANCE_MASK))
+ return false;
+ switch (handle & KCOV_SUBSYSTEM_MASK) {
+ case KCOV_SUBSYSTEM_COMMON:
+ return (handle & KCOV_INSTANCE_MASK) ?
+ common_valid : zero_valid;
+ case KCOV_SUBSYSTEM_USB:
+ return uncommon_valid;
+ default:
+ return false;
+ }
+ return false;
+}
+
static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
unsigned long size, unused;
+ int mode, i;
+ struct kcov_remote_arg *remote_arg;
+ struct kcov_remote *remote;
switch (cmd) {
case KCOV_INIT_TRACE:
+ kcov_debug("KCOV_INIT_TRACE\n");
/*
* Enable kcov in trace mode and setup buffer size.
* Must happen before anything else.
@@ -366,6 +572,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
+ kcov_debug("KCOV_ENABLE\n");
/*
* Enable coverage for the current task.
* At this point user must have been enabled trace mode,
@@ -378,29 +585,20 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
t = current;
if (kcov->t != NULL || t->kcov != NULL)
return -EBUSY;
- if (arg == KCOV_TRACE_PC)
- kcov->mode = KCOV_MODE_TRACE_PC;
- else if (arg == KCOV_TRACE_CMP)
-#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
- kcov->mode = KCOV_MODE_TRACE_CMP;
-#else
- return -ENOTSUPP;
-#endif
- else
- return -EINVAL;
+ mode = kcov_get_mode(arg);
+ if (mode < 0)
+ return mode;
kcov_fault_in_area(kcov);
- /* Cache in task struct for performance. */
- t->kcov_size = kcov->size;
- t->kcov_area = kcov->area;
- /* See comment in check_kcov_mode(). */
- barrier();
- WRITE_ONCE(t->kcov_mode, kcov->mode);
+ kcov->mode = mode;
+ kcov_start(t, kcov->size, kcov->area, kcov->mode,
+ kcov->sequence);
t->kcov = kcov;
kcov->t = t;
- /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+ /* Put either in kcov_task_exit() or in KCOV_DISABLE. */
kcov_get(kcov);
return 0;
case KCOV_DISABLE:
+ kcov_debug("KCOV_DISABLE\n");
/* Disable coverage for the current task. */
unused = arg;
if (unused != 0 || current->kcov != kcov)
@@ -408,11 +606,65 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
t = current;
if (WARN_ON(kcov->t != t))
return -EINVAL;
- kcov_task_init(t);
- kcov->t = NULL;
- kcov->mode = KCOV_MODE_INIT;
+ kcov_disable(t, kcov);
kcov_put(kcov);
return 0;
+ case KCOV_REMOTE_ENABLE:
+ kcov_debug("KCOV_REMOTE_ENABLE\n");
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+ return -EINVAL;
+ t = current;
+ if (kcov->t != NULL || t->kcov != NULL)
+ return -EBUSY;
+ remote_arg = (struct kcov_remote_arg *)arg;
+ mode = kcov_get_mode(remote_arg->trace_mode);
+ if (mode < 0)
+ return mode;
+ if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ kcov->mode = mode;
+ t->kcov = kcov;
+ kcov->t = t;
+ kcov->remote = true;
+ kcov->remote_size = remote_arg->area_size;
+ spin_lock(&kcov_remote_lock);
+ for (i = 0; i < remote_arg->num_handles; i++) {
+ kcov_debug("handle %llx\n", remote_arg->handles[i]);
+ if (!kcov_check_handle(remote_arg->handles[i],
+ false, true, false)) {
+ spin_unlock(&kcov_remote_lock);
+ kcov_disable(t, kcov);
+ return -EINVAL;
+ }
+ remote = kcov_remote_add(kcov, remote_arg->handles[i]);
+ if (IS_ERR(remote)) {
+ spin_unlock(&kcov_remote_lock);
+ kcov_disable(t, kcov);
+ return PTR_ERR(remote);
+ }
+ }
+ if (remote_arg->common_handle) {
+ kcov_debug("common handle %llx\n",
+ remote_arg->common_handle);
+ if (!kcov_check_handle(remote_arg->common_handle,
+ true, false, false)) {
+ spin_unlock(&kcov_remote_lock);
+ kcov_disable(t, kcov);
+ return -EINVAL;
+ }
+ remote = kcov_remote_add(kcov,
+ remote_arg->common_handle);
+ if (IS_ERR(remote)) {
+ spin_unlock(&kcov_remote_lock);
+ kcov_disable(t, kcov);
+ return PTR_ERR(remote);
+ }
+ t->kcov_handle = remote_arg->common_handle;
+ }
+ spin_unlock(&kcov_remote_lock);
+ /* Put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
default:
return -ENOTTY;
}
@@ -422,11 +674,35 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
struct kcov *kcov;
int res;
+ struct kcov_remote_arg *remote_arg = NULL;
+ unsigned int remote_num_handles;
+ unsigned long remote_arg_size;
+
+ if (cmd == KCOV_REMOTE_ENABLE) {
+ if (get_user(remote_num_handles, (unsigned __user *)(arg +
+ offsetof(struct kcov_remote_arg, num_handles))))
+ return -EFAULT;
+ if (remote_num_handles > KCOV_REMOTE_MAX_HANDLES)
+ return -EINVAL;
+ remote_arg_size = struct_size(remote_arg, handles,
+ remote_num_handles);
+ remote_arg = memdup_user((void __user *)arg, remote_arg_size);
+ if (IS_ERR(remote_arg))
+ return PTR_ERR(remote_arg);
+ if (remote_arg->num_handles != remote_num_handles) {
+ kfree(remote_arg);
+ return -EINVAL;
+ }
+ arg = (unsigned long)remote_arg;
+ }
kcov = filep->private_data;
spin_lock(&kcov->lock);
res = kcov_ioctl_locked(kcov, cmd, arg);
spin_unlock(&kcov->lock);
+
+ kfree(remote_arg);
+
return res;
}
@@ -438,6 +714,207 @@ static const struct file_operations kcov_fops = {
.release = kcov_close,
};
+/*
+ * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section
+ * of code in a kernel background thread to allow kcov to be used to collect
+ * coverage from that part of code.
+ *
+ * The handle argument of kcov_remote_start() identifies a code section that is
+ * used for coverage collection. A userspace process passes this handle to
+ * KCOV_REMOTE_ENABLE ioctl to make the used kcov device start collecting
+ * coverage for the code section identified by this handle.
+ *
+ * The usage of these annotations in the kernel code is different depending on
+ * the type of the kernel thread whose code is being annotated.
+ *
+ * For global kernel threads that are spawned in a limited number of instances
+ * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each
+ * instance must be assigned a unique 4-byte instance id. The instance id is
+ * then combined with a 1-byte subsystem id to get a handle via
+ * kcov_remote_handle(subsystem_id, instance_id).
+ *
+ * For local kernel threads that are spawned from system calls handler when a
+ * user interacts with some kernel interface (e.g. vhost workers), a handle is
+ * passed from a userspace process as the common_handle field of the
+ * kcov_remote_arg struct (note, that the user must generate a handle by using
+ * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an
+ * arbitrary 4-byte non-zero number as the instance id). This common handle
+ * then gets saved into the task_struct of the process that issued the
+ * KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn
+ * kernel threads, the common handle must be retrived via kcov_common_handle()
+ * and passed to the spawned threads via custom annotations. Those kernel
+ * threads must in turn be annotated with kcov_remote_start(common_handle) and
+ * kcov_remote_stop(). All of the threads that are spawned by the same process
+ * obtain the same handle, hence the name "common".
+ *
+ * See Documentation/dev-tools/kcov.rst for more details.
+ *
+ * Internally, this function looks up the kcov device associated with the
+ * provided handle, allocates an area for coverage collection, and saves the
+ * pointers to kcov and area into the current task_struct to allow coverage to
+ * be collected via __sanitizer_cov_trace_pc()
+ * In turns kcov_remote_stop() clears those pointers from task_struct to stop
+ * collecting coverage and copies all collected coverage into the kcov area.
+ */
+void kcov_remote_start(u64 handle)
+{
+ struct kcov_remote *remote;
+ void *area;
+ struct task_struct *t;
+ unsigned int size;
+ enum kcov_mode mode;
+ int sequence;
+
+ if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
+ return;
+ if (WARN_ON(!in_task()))
+ return;
+ t = current;
+ /*
+ * Check that kcov_remote_start is not called twice
+ * nor called by user tasks (with enabled kcov).
+ */
+ if (WARN_ON(t->kcov))
+ return;
+
+ kcov_debug("handle = %llx\n", handle);
+
+ spin_lock(&kcov_remote_lock);
+ remote = kcov_remote_find(handle);
+ if (!remote) {
+ kcov_debug("no remote found");
+ spin_unlock(&kcov_remote_lock);
+ return;
+ }
+ /* Put in kcov_remote_stop(). */
+ kcov_get(remote->kcov);
+ t->kcov = remote->kcov;
+ /*
+ * Read kcov fields before unlock to prevent races with
+ * KCOV_DISABLE / kcov_remote_reset().
+ */
+ size = remote->kcov->remote_size;
+ mode = remote->kcov->mode;
+ sequence = remote->kcov->sequence;
+ area = kcov_remote_area_get(size);
+ spin_unlock(&kcov_remote_lock);
+
+ if (!area) {
+ area = vmalloc(size * sizeof(unsigned long));
+ if (!area) {
+ t->kcov = NULL;
+ kcov_put(remote->kcov);
+ return;
+ }
+ }
+ /* Reset coverage size. */
+ *(u64 *)area = 0;
+
+ kcov_debug("area = %px, size = %u", area, size);
+
+ kcov_start(t, size, area, mode, sequence);
+
+}
+EXPORT_SYMBOL(kcov_remote_start);
+
+static void kcov_move_area(enum kcov_mode mode, void *dst_area,
+ unsigned int dst_area_size, void *src_area)
+{
+ u64 word_size = sizeof(unsigned long);
+ u64 count_size, entry_size_log;
+ u64 dst_len, src_len;
+ void *dst_entries, *src_entries;
+ u64 dst_occupied, dst_free, bytes_to_move, entries_moved;
+
+ kcov_debug("%px %u <= %px %lu\n",
+ dst_area, dst_area_size, src_area, *(unsigned long *)src_area);
+
+ switch (mode) {
+ case KCOV_MODE_TRACE_PC:
+ dst_len = READ_ONCE(*(unsigned long *)dst_area);
+ src_len = *(unsigned long *)src_area;
+ count_size = sizeof(unsigned long);
+ entry_size_log = __ilog2_u64(sizeof(unsigned long));
+ break;
+ case KCOV_MODE_TRACE_CMP:
+ dst_len = READ_ONCE(*(u64 *)dst_area);
+ src_len = *(u64 *)src_area;
+ count_size = sizeof(u64);
+ BUILD_BUG_ON(!is_power_of_2(KCOV_WORDS_PER_CMP));
+ entry_size_log = __ilog2_u64(sizeof(u64) * KCOV_WORDS_PER_CMP);
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+
+ /* As arm can't divide u64 integers use log of entry size. */
+ if (dst_len > ((dst_area_size * word_size - count_size) >>
+ entry_size_log))
+ return;
+ dst_occupied = count_size + (dst_len << entry_size_log);
+ dst_free = dst_area_size * word_size - dst_occupied;
+ bytes_to_move = min(dst_free, src_len << entry_size_log);
+ dst_entries = dst_area + dst_occupied;
+ src_entries = src_area + count_size;
+ memcpy(dst_entries, src_entries, bytes_to_move);
+ entries_moved = bytes_to_move >> entry_size_log;
+
+ switch (mode) {
+ case KCOV_MODE_TRACE_PC:
+ WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
+ break;
+ case KCOV_MODE_TRACE_CMP:
+ WRITE_ONCE(*(u64 *)dst_area, dst_len + entries_moved);
+ break;
+ default:
+ break;
+ }
+}
+
+/* See the comment before kcov_remote_start() for usage details. */
+void kcov_remote_stop(void)
+{
+ struct task_struct *t = current;
+ struct kcov *kcov = t->kcov;
+ void *area = t->kcov_area;
+ unsigned int size = t->kcov_size;
+ int sequence = t->kcov_sequence;
+
+ if (!kcov) {
+ kcov_debug("no kcov found\n");
+ return;
+ }
+
+ kcov_stop(t);
+ t->kcov = NULL;
+
+ spin_lock(&kcov->lock);
+ /*
+ * KCOV_DISABLE could have been called between kcov_remote_start()
+ * and kcov_remote_stop(), hence the check.
+ */
+ kcov_debug("move if: %d == %d && %d\n",
+ sequence, kcov->sequence, (int)kcov->remote);
+ if (sequence == kcov->sequence && kcov->remote)
+ kcov_move_area(kcov->mode, kcov->area, kcov->size, area);
+ spin_unlock(&kcov->lock);
+
+ spin_lock(&kcov_remote_lock);
+ kcov_remote_area_put(area, size);
+ spin_unlock(&kcov_remote_lock);
+
+ kcov_put(kcov);
+}
+EXPORT_SYMBOL(kcov_remote_stop);
+
+/* See the comment before kcov_remote_start() for usage details. */
+u64 kcov_common_handle(void)
+{
+ return current->kcov_handle;
+}
+EXPORT_SYMBOL(kcov_common_handle);
+
static int __init kcov_init(void)
{
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1b018f1a6e0d..f977786fe498 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -159,6 +159,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/* Install the new kernel and uninstall the old */
image = xchg(dest_image, image);
@@ -206,6 +210,14 @@ static inline int kexec_load_check(unsigned long nr_segments,
return result;
/*
+ * kexec can be used to circumvent module loading restrictions, so
+ * prevent loading in that case
+ */
+ result = security_locked_down(LOCKDOWN_KEXEC);
+ if (result)
+ return result;
+
+ /*
* Verify we have a legal set of flags
* This leaves us room for future extensions.
*/
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5870723b8ad..c19c0dad1ebe 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
+ if (fatal_signal_pending(current))
+ return NULL;
pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
@@ -587,6 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->unusable_pages);
}
+
+int __weak machine_kexec_post_load(struct kimage *image)
+{
+ return 0;
+}
+
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -1169,7 +1177,7 @@ int kernel_kexec(void)
* CPU hotplug again; so re-enable it here.
*/
cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
+ pr_notice("Starting new kernel\n");
machine_shutdown();
}
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
new file mode 100644
index 000000000000..d3689632e8b9
--- /dev/null
+++ b/kernel/kexec_elf.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ */
+
+#define pr_fmt(fmt) "kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+ return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le64_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be64_to_cpu(value);
+
+ return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le32_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be32_to_cpu(value);
+
+ return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le16_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be16_to_cpu(value);
+
+ return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+ if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+ pr_debug("Bad program header size.\n");
+ return false;
+ } else if (ehdr->e_shnum > 0 &&
+ ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+ pr_debug("Bad section header size.\n");
+ return false;
+ } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr->e_version != EV_CURRENT) {
+ pr_debug("Unknown ELF version.\n");
+ return false;
+ }
+
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ size_t phdr_size;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ /* Sanity check the program header table location. */
+ if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+ pr_debug("Program headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_phoff + phdr_size > buf_len) {
+ pr_debug("Program headers truncated.\n");
+ return false;
+ }
+ }
+
+ if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+ size_t shdr_size;
+
+ /*
+ * e_shnum is at most 65536 so calculating
+ * the size of the section header cannot overflow.
+ */
+ shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+ /* Sanity check the section header table location. */
+ if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+ pr_debug("Section headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_shoff + shdr_size > buf_len) {
+ pr_debug("Section headers truncated.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+ struct elfhdr *buf_ehdr;
+
+ if (len < sizeof(*buf_ehdr)) {
+ pr_debug("Buffer is too small to hold ELF header.\n");
+ return -ENOEXEC;
+ }
+
+ memset(ehdr, 0, sizeof(*ehdr));
+ memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+ if (!elf_is_elf_file(ehdr)) {
+ pr_debug("No ELF header magic.\n");
+ return -ENOEXEC;
+ }
+
+ if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+ pr_debug("Not a supported ELF class.\n");
+ return -ENOEXEC;
+ } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+ ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+ pr_debug("Not a supported ELF data format.\n");
+ return -ENOEXEC;
+ }
+
+ buf_ehdr = (struct elfhdr *) buf;
+ if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+ pr_debug("Bad ELF header size.\n");
+ return -ENOEXEC;
+ }
+
+ ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+ ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+ ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+ ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+ ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+ ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+ ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+ ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+ ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ case ELFCLASS32:
+ ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+ if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+ pr_debug("ELF segment location wraps around.\n");
+ return false;
+ } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+ pr_debug("ELF segment not in file.\n");
+ return false;
+ } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+ pr_debug("ELF segment address wraps around.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info,
+ int idx)
+{
+ /* Override the const in proghdrs, we are the ones doing the loading. */
+ struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+ const struct elfhdr *ehdr = elf_info->ehdr;
+ const char *pbuf;
+ struct elf_phdr *buf_phdr;
+
+ pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+ buf_phdr = (struct elf_phdr *) pbuf;
+
+ phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+ phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ case ELFCLASS32:
+ phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info)
+{
+ size_t phdr_size, i;
+ const struct elfhdr *ehdr = elf_info->ehdr;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+ if (!elf_info->proghdrs)
+ return -ENOMEM;
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ int ret;
+
+ ret = elf_read_phdr(buf, len, elf_info, i);
+ if (ret) {
+ kfree(elf_info->proghdrs);
+ elf_info->proghdrs = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
+ * @buf: Buffer to read ELF file from.
+ * @len: Size of @buf.
+ * @ehdr: Pointer to existing struct which will be populated.
+ * @elf_info: Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call
+ * kexec_free_elf_info(elf_info) to free the memory allocated for the section
+ * and program headers.
+ */
+static int elf_read_from_buffer(const char *buf, size_t len,
+ struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int ret;
+
+ ret = elf_read_ehdr(buf, len, ehdr);
+ if (ret)
+ return ret;
+
+ elf_info->buffer = buf;
+ elf_info->ehdr = ehdr;
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ ret = elf_read_phdrs(buf, len, elf_info);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * kexec_free_elf_info - free memory allocated by elf_read_from_buffer
+ */
+void kexec_free_elf_info(struct kexec_elf_info *elf_info)
+{
+ kfree(elf_info->proghdrs);
+ memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * kexec_build_elf_info - read ELF executable and check that we can use it
+ */
+int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int i;
+ int ret;
+
+ ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+ if (ret)
+ return ret;
+
+ /* Big endian vmlinux has type ET_DYN. */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+ pr_err("Not an ELF executable.\n");
+ goto error;
+ } else if (!elf_info->proghdrs) {
+ pr_err("No ELF program header.\n");
+ goto error;
+ }
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /*
+ * Kexec does not support loading interpreters.
+ * In addition this check keeps us from attempting
+ * to kexec ordinay executables.
+ */
+ if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+ pr_err("Requires an ELF interpreter.\n");
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ kexec_free_elf_info(elf_info);
+ return -ENOEXEC;
+}
+
+
+int kexec_elf_probe(const char *buf, unsigned long len)
+{
+ struct elfhdr ehdr;
+ struct kexec_elf_info elf_info;
+ int ret;
+
+ ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info);
+ if (ret)
+ return ret;
+
+ kexec_free_elf_info(&elf_info);
+
+ return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * kexec_elf_load - load ELF executable image
+ * @lowest_load_addr: On return, will be the address where the first PT_LOAD
+ * section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info,
+ struct kexec_buf *kbuf,
+ unsigned long *lowest_load_addr)
+{
+ unsigned long lowest_addr = UINT_MAX;
+ int ret;
+ size_t i;
+
+ /* Read in the PT_LOAD segments. */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ unsigned long load_addr;
+ size_t size;
+ const struct elf_phdr *phdr;
+
+ phdr = &elf_info->proghdrs[i];
+ if (phdr->p_type != PT_LOAD)
+ continue;
+
+ size = phdr->p_filesz;
+ if (size > phdr->p_memsz)
+ size = phdr->p_memsz;
+
+ kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset;
+ kbuf->bufsz = size;
+ kbuf->memsz = phdr->p_memsz;
+ kbuf->buf_align = phdr->p_align;
+ kbuf->buf_min = phdr->p_paddr;
+ kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
+ goto out;
+ load_addr = kbuf->mem;
+
+ if (load_addr < lowest_addr)
+ lowest_addr = load_addr;
+ }
+
+ *lowest_load_addr = lowest_addr;
+ ret = 0;
+ out:
+ return ret;
+}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b8cc032d5620..faa74d5f6941 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
return kexec_image_post_load_cleanup_default(image);
}
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
unsigned long buf_len)
{
@@ -177,6 +177,59 @@ void kimage_file_post_load_cleanup(struct kimage *image)
image->image_loader_data = NULL;
}
+#ifdef CONFIG_KEXEC_SIG
+static int
+kimage_validate_signature(struct kimage *image)
+{
+ const char *reason;
+ int ret;
+
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ switch (ret) {
+ case 0:
+ break;
+
+ /* Certain verification errors are non-fatal if we're not
+ * checking errors, provided we aren't mandating that there
+ * must be a valid signature.
+ */
+ case -ENODATA:
+ reason = "kexec of unsigned image";
+ goto decide;
+ case -ENOPKG:
+ reason = "kexec of image with unsupported crypto";
+ goto decide;
+ case -ENOKEY:
+ reason = "kexec of image with unavailable key";
+ decide:
+ if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ pr_notice("%s rejected\n", reason);
+ return ret;
+ }
+
+ /* If IMA is guaranteed to appraise a signature on the kexec
+ * image, permit it even if the kernel is otherwise locked
+ * down.
+ */
+ if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
+ security_locked_down(LOCKDOWN_KEXEC))
+ return -EPERM;
+
+ return 0;
+
+ /* All other errors are fatal, including nomem, unparseable
+ * signatures and signature check failures - even if signatures
+ * aren't required.
+ */
+ default:
+ pr_notice("kernel signature verification failed (%d).\n", ret);
+ }
+
+ return ret;
+}
+#endif
+
/*
* In file mode list of segments is prepared by kernel. Copy relevant
* data from user space, do error checking, prepare segment list
@@ -186,7 +239,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
const char __user *cmdline_ptr,
unsigned long cmdline_len, unsigned flags)
{
- int ret = 0;
+ int ret;
void *ldata;
loff_t size;
@@ -202,14 +255,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
if (ret)
goto out;
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
+#ifdef CONFIG_KEXEC_SIG
+ ret = kimage_validate_signature(image);
+
+ if (ret)
goto out;
- }
- pr_debug("kernel signature verification successful.\n");
#endif
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
@@ -391,6 +441,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
@@ -1254,7 +1308,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
if (kernel_map) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_vaddr = (unsigned long) _text;
phdr->p_filesz = phdr->p_memsz = _end - _text;
phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
ehdr->e_phnum++;
@@ -1271,7 +1325,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_offset = mstart;
phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_vaddr = (unsigned long) __va(mstart);
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..39d30ccf8d87 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,6 +13,8 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
+int machine_kexec_post_load(struct kimage *image);
+
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9873fc627d61..2625c241ac00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
*/
static void do_optimize_kprobes(void)
{
+ lockdep_assert_held(&text_mutex);
/*
* The optimization/unoptimization refers online_cpus via
* stop_machine() and cpu-hotplug modifies online_cpus.
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void)
list_empty(&optimizing_list))
return;
- mutex_lock(&text_mutex);
arch_optimize_kprobes(&optimizing_list);
- mutex_unlock(&text_mutex);
}
/*
@@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
+ lockdep_assert_held(&text_mutex);
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
@@ -507,10 +507,11 @@ static void do_unoptimize_kprobes(void)
if (list_empty(&unoptimizing_list))
return;
- mutex_lock(&text_mutex);
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -524,7 +525,6 @@ static void do_unoptimize_kprobes(void)
} else
list_del_init(&op->list);
}
- mutex_unlock(&text_mutex);
}
/* Reclaim all kprobes on the free_list */
@@ -556,6 +556,7 @@ static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
cpus_read_lock();
+ mutex_lock(&text_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
@@ -583,6 +584,7 @@ static void kprobe_optimizer(struct work_struct *work)
do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
+ mutex_unlock(&text_mutex);
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
@@ -610,6 +612,18 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
+static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
+}
+
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
@@ -649,6 +667,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
@@ -962,8 +983,15 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
+
+static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
@@ -976,58 +1004,75 @@ static int prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static int arm_kprobe_ftrace(struct kprobe *p)
+static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 0, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
if (ret) {
pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
- if (kprobe_ftrace_enabled == 0) {
- ret = register_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 0) {
+ ret = register_ftrace_function(ops);
if (ret) {
pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
goto err_ftrace;
}
}
- kprobe_ftrace_enabled++;
+ (*cnt)++;
return ret;
err_ftrace:
/*
- * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
- * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
- * empty filter_hash which would undesirably trace all functions.
+ * At this point, sinec ops is not registered, we should be sefe from
+ * registering empty filter.
*/
- ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
return ret;
}
+static int arm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __arm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
+
/* Caller must lock kprobe_mutex */
-static int disarm_kprobe_ftrace(struct kprobe *p)
+static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- if (kprobe_ftrace_enabled == 1) {
- ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 1) {
+ ret = unregister_ftrace_function(ops);
if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
return ret;
}
- kprobe_ftrace_enabled--;
+ (*cnt)--;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 1, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
+
+static int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __disarm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) (-ENODEV)
@@ -1514,7 +1559,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
- jump_label_text_reserved(p->addr, p->addr)) {
+ jump_label_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
@@ -1906,7 +1952,7 @@ int register_kretprobe(struct kretprobe *rp)
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 621467c33fef..b262f47046ca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -866,9 +866,9 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
-void __kthread_queue_delayed_work(struct kthread_worker *worker,
- struct kthread_delayed_work *dwork,
- unsigned long delay)
+static void __kthread_queue_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
{
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index e3acead004e6..8d1c15832e55 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -255,17 +255,17 @@ static int lstats_open(struct inode *inode, struct file *filp)
return single_open(filp, lstats_show, NULL);
}
-static const struct file_operations lstats_fops = {
- .open = lstats_open,
- .read = seq_read,
- .write = lstats_write,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops lstats_proc_ops = {
+ .proc_open = lstats_open,
+ .proc_read = seq_read,
+ .proc_write = lstats_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init init_lstats_procfs(void)
{
- proc_create("latency_stats", 0644, NULL, &lstats_fops);
+ proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
return 0;
}
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index cf9b5bcdb952..cf03d4bdfc66 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o patch.o shadow.o transition.o
+livepatch-objs := core.o patch.o shadow.o state.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c4ce08f43bd6..c3512e7e0801 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -22,6 +22,7 @@
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
+#include "state.h"
#include "transition.h"
/*
@@ -632,7 +633,7 @@ static void klp_free_objects_dynamic(struct klp_patch *patch)
* The operation must be completed by calling klp_free_patch_finish()
* outside klp_mutex.
*/
-void klp_free_patch_start(struct klp_patch *patch)
+static void klp_free_patch_start(struct klp_patch *patch)
{
if (!list_empty(&patch->list))
list_del(&patch->list);
@@ -677,6 +678,23 @@ static void klp_free_patch_work_fn(struct work_struct *work)
klp_free_patch_finish(patch);
}
+void klp_free_patch_async(struct klp_patch *patch)
+{
+ klp_free_patch_start(patch);
+ schedule_work(&patch->free_work);
+}
+
+void klp_free_replaced_patches_async(struct klp_patch *new_patch)
+{
+ struct klp_patch *old_patch, *tmp_patch;
+
+ klp_for_each_patch_safe(old_patch, tmp_patch) {
+ if (old_patch == new_patch)
+ return;
+ klp_free_patch_async(old_patch);
+ }
+}
+
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
if (!func->old_name)
@@ -992,6 +1010,13 @@ int klp_enable_patch(struct klp_patch *patch)
mutex_lock(&klp_mutex);
+ if (!klp_is_patch_compatible(patch)) {
+ pr_err("Livepatch patch (%s) is not compatible with the already installed livepatches.\n",
+ patch->mod->name);
+ mutex_unlock(&klp_mutex);
+ return -EINVAL;
+ }
+
ret = klp_init_patch_early(patch);
if (ret) {
mutex_unlock(&klp_mutex);
@@ -1022,12 +1047,13 @@ err:
EXPORT_SYMBOL_GPL(klp_enable_patch);
/*
- * This function removes replaced patches.
+ * This function unpatches objects from the replaced livepatches.
*
* We could be pretty aggressive here. It is called in the situation where
- * these structures are no longer accessible. All functions are redirected
- * by the klp_transition_patch. They use either a new code or they are in
- * the original code because of the special nop function patches.
+ * these structures are no longer accessed from the ftrace handler.
+ * All functions are redirected by the klp_transition_patch. They
+ * use either a new code or they are in the original code because
+ * of the special nop function patches.
*
* The only exception is when the transition was forced. In this case,
* klp_ftrace_handler() might still see the replaced patch on the stack.
@@ -1035,18 +1061,16 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* thanks to RCU. We only have to keep the patches on the system. Also
* this is handled transparently by patch->module_put.
*/
-void klp_discard_replaced_patches(struct klp_patch *new_patch)
+void klp_unpatch_replaced_patches(struct klp_patch *new_patch)
{
- struct klp_patch *old_patch, *tmp_patch;
+ struct klp_patch *old_patch;
- klp_for_each_patch_safe(old_patch, tmp_patch) {
+ klp_for_each_patch(old_patch) {
if (old_patch == new_patch)
return;
old_patch->enabled = false;
klp_unpatch_objects(old_patch);
- klp_free_patch_start(old_patch);
- schedule_work(&old_patch->free_work);
}
}
@@ -1175,6 +1199,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
+ obj->mod = NULL;
klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index ec43a40b853f..38209c7361b6 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -13,8 +13,9 @@ extern struct list_head klp_patches;
#define klp_for_each_patch(patch) \
list_for_each_entry(patch, &klp_patches, list)
-void klp_free_patch_start(struct klp_patch *patch);
-void klp_discard_replaced_patches(struct klp_patch *new_patch);
+void klp_free_patch_async(struct klp_patch *patch);
+void klp_free_replaced_patches_async(struct klp_patch *new_patch);
+void klp_unpatch_replaced_patches(struct klp_patch *new_patch);
void klp_discard_nops(struct klp_patch *new_patch);
static inline bool klp_is_object_loaded(struct klp_object *obj)
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index bd43537702bd..b552cf2d85f8 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func)
ops->fops.func = klp_ftrace_handler;
ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
FTRACE_OPS_FL_DYNAMIC |
- FTRACE_OPS_FL_IPMODIFY;
+ FTRACE_OPS_FL_IPMODIFY |
+ FTRACE_OPS_FL_PERMANENT;
list_add(&ops->node, &klp_ops);
diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c
new file mode 100644
index 000000000000..7ee19476de9d
--- /dev/null
+++ b/kernel/livepatch/state.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * system_state.c - State of the system modified by livepatches
+ *
+ * Copyright (C) 2019 SUSE
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/livepatch.h>
+#include "core.h"
+#include "state.h"
+#include "transition.h"
+
+#define klp_for_each_state(patch, state) \
+ for (state = patch->states; state && state->id; state++)
+
+/**
+ * klp_get_state() - get information about system state modified by
+ * the given patch
+ * @patch: livepatch that modifies the given system state
+ * @id: custom identifier of the modified system state
+ *
+ * Checks whether the given patch modifies the given system state.
+ *
+ * The function can be called either from pre/post (un)patch
+ * callbacks or from the kernel code added by the livepatch.
+ *
+ * Return: pointer to struct klp_state when found, otherwise NULL.
+ */
+struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id)
+{
+ struct klp_state *state;
+
+ klp_for_each_state(patch, state) {
+ if (state->id == id)
+ return state;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_get_state);
+
+/**
+ * klp_get_prev_state() - get information about system state modified by
+ * the already installed livepatches
+ * @id: custom identifier of the modified system state
+ *
+ * Checks whether already installed livepatches modify the given
+ * system state.
+ *
+ * The same system state can be modified by more non-cumulative
+ * livepatches. It is expected that the latest livepatch has
+ * the most up-to-date information.
+ *
+ * The function can be called only during transition when a new
+ * livepatch is being enabled or when such a transition is reverted.
+ * It is typically called only from from pre/post (un)patch
+ * callbacks.
+ *
+ * Return: pointer to the latest struct klp_state from already
+ * installed livepatches, NULL when not found.
+ */
+struct klp_state *klp_get_prev_state(unsigned long id)
+{
+ struct klp_patch *patch;
+ struct klp_state *state, *last_state = NULL;
+
+ if (WARN_ON_ONCE(!klp_transition_patch))
+ return NULL;
+
+ klp_for_each_patch(patch) {
+ if (patch == klp_transition_patch)
+ goto out;
+
+ state = klp_get_state(patch, id);
+ if (state)
+ last_state = state;
+ }
+
+out:
+ return last_state;
+}
+EXPORT_SYMBOL_GPL(klp_get_prev_state);
+
+/* Check if the patch is able to deal with the existing system state. */
+static bool klp_is_state_compatible(struct klp_patch *patch,
+ struct klp_state *old_state)
+{
+ struct klp_state *state;
+
+ state = klp_get_state(patch, old_state->id);
+
+ /* A cumulative livepatch must handle all already modified states. */
+ if (!state)
+ return !patch->replace;
+
+ return state->version >= old_state->version;
+}
+
+/*
+ * Check that the new livepatch will not break the existing system states.
+ * Cumulative patches must handle all already modified states.
+ * Non-cumulative patches can touch already modified states.
+ */
+bool klp_is_patch_compatible(struct klp_patch *patch)
+{
+ struct klp_patch *old_patch;
+ struct klp_state *old_state;
+
+ klp_for_each_patch(old_patch) {
+ klp_for_each_state(old_patch, old_state) {
+ if (!klp_is_state_compatible(patch, old_state))
+ return false;
+ }
+ }
+
+ return true;
+}
diff --git a/kernel/livepatch/state.h b/kernel/livepatch/state.h
new file mode 100644
index 000000000000..49d9c16e8762
--- /dev/null
+++ b/kernel/livepatch/state.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIVEPATCH_STATE_H
+#define _LIVEPATCH_STATE_H
+
+#include <linux/livepatch.h>
+
+bool klp_is_patch_compatible(struct klp_patch *patch);
+
+#endif /* _LIVEPATCH_STATE_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index cdf318d86dd6..f6310f848f34 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -78,7 +78,7 @@ static void klp_complete_transition(void)
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
- klp_discard_replaced_patches(klp_transition_patch);
+ klp_unpatch_replaced_patches(klp_transition_patch);
klp_discard_nops(klp_transition_patch);
}
@@ -446,14 +446,14 @@ void klp_try_complete_transition(void)
klp_complete_transition();
/*
- * It would make more sense to free the patch in
+ * It would make more sense to free the unused patches in
* klp_complete_transition() but it is called also
* from klp_cancel_transition().
*/
- if (!patch->enabled) {
- klp_free_patch_start(patch);
- schedule_work(&patch->free_work);
- }
+ if (!patch->enabled)
+ klp_free_patch_async(patch);
+ else if (patch->replace)
+ klp_free_replaced_patches_async(patch);
}
/*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4861cf8e274b..32406ef0d6a2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -449,33 +449,100 @@ static void print_lockdep_off(const char *bug_msg)
unsigned long nr_stack_trace_entries;
#ifdef CONFIG_PROVE_LOCKING
+/**
+ * struct lock_trace - single stack backtrace
+ * @hash_entry: Entry in a stack_trace_hash[] list.
+ * @hash: jhash() of @entries.
+ * @nr_entries: Number of entries in @entries.
+ * @entries: Actual stack backtrace.
+ */
+struct lock_trace {
+ struct hlist_node hash_entry;
+ u32 hash;
+ u32 nr_entries;
+ unsigned long entries[0] __aligned(sizeof(unsigned long));
+};
+#define LOCK_TRACE_SIZE_IN_LONGS \
+ (sizeof(struct lock_trace) / sizeof(unsigned long))
/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
+ * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.
*/
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE];
+
+static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2)
+{
+ return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries &&
+ memcmp(t1->entries, t2->entries,
+ t1->nr_entries * sizeof(t1->entries[0])) == 0;
+}
-static int save_trace(struct lock_trace *trace)
+static struct lock_trace *save_trace(void)
{
- unsigned long *entries = stack_trace + nr_stack_trace_entries;
- unsigned int max_entries;
+ struct lock_trace *trace, *t2;
+ struct hlist_head *hash_head;
+ u32 hash;
+ int max_entries;
+
+ BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
+ BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
- trace->offset = nr_stack_trace_entries;
- max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->nr_entries = stack_trace_save(entries, max_entries, 3);
- nr_stack_trace_entries += trace->nr_entries;
+ trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
+ LOCK_TRACE_SIZE_IN_LONGS;
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (max_entries <= 0) {
if (!debug_locks_off_graph_unlock())
- return 0;
+ return NULL;
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
- return 0;
+ return NULL;
+ }
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
+
+ hash = jhash(trace->entries, trace->nr_entries *
+ sizeof(trace->entries[0]), 0);
+ trace->hash = hash;
+ hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1));
+ hlist_for_each_entry(t2, hash_head, hash_entry) {
+ if (traces_identical(trace, t2))
+ return t2;
}
+ nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries;
+ hlist_add_head(&trace->hash_entry, hash_head);
- return 1;
+ return trace;
+}
+
+/* Return the number of stack traces in the stack_trace[] array. */
+u64 lockdep_stack_trace_count(void)
+{
+ struct lock_trace *trace;
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) {
+ hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) {
+ c++;
+ }
+ }
+
+ return c;
+}
+
+/* Return the number of stack hash chains that have at least one stack trace. */
+u64 lockdep_stack_hash_count(void)
+{
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++)
+ if (!hlist_empty(&stack_trace_hash[i]))
+ c++;
+
+ return c;
}
#endif
@@ -511,7 +578,7 @@ static const char *usage_str[] =
};
#endif
-const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)
{
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
@@ -620,7 +687,7 @@ static void print_lock(struct held_lock *hlock)
return;
}
- printk(KERN_CONT "%p", hlock->instance);
+ printk(KERN_CONT "%px", hlock->instance);
print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -1235,7 +1302,7 @@ static struct lock_list *alloc_list_entry(void)
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct lock_trace *trace)
+ const struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -1249,7 +1316,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
entry->distance = distance;
- entry->trace = *trace;
+ entry->trace = trace;
/*
* Both allocation and removal are done under the graph lock; but
* iteration is under RCU-sched; see look_up_lock_class() and
@@ -1470,11 +1537,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
}
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+static void print_lock_trace(const struct lock_trace *trace,
+ unsigned int spaces)
{
- unsigned long *entries = stack_trace + trace->offset;
-
- stack_trace_print(entries, trace->nr_entries, spaces);
+ stack_trace_print(trace->entries, trace->nr_entries, spaces);
}
/*
@@ -1489,7 +1555,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_lock_trace(&target->trace, 6);
+ print_lock_trace(target->trace, 6);
}
static void
@@ -1592,7 +1658,8 @@ static noinline void print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
- if (!save_trace(&this->trace))
+ this->trace = save_trace();
+ if (!this->trace)
return;
depth = get_lock_depth(target);
@@ -1715,7 +1782,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry,
*/
static noinline int
check_noncircular(struct held_lock *src, struct held_lock *target,
- struct lock_trace *trace)
+ struct lock_trace **const trace)
{
int ret;
struct lock_list *uninitialized_var(target_entry);
@@ -1729,13 +1796,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
ret = check_path(hlock_class(target), &src_entry, &target_entry);
if (unlikely(!ret)) {
- if (!trace->nr_entries) {
+ if (!*trace) {
/*
* If save_trace fails here, the printing might
* trigger a WARN but because of the !nr_entries it
* should not do bad things.
*/
- save_trace(trace);
+ *trace = save_trace();
}
print_circular_bug(&src_entry, target_entry, src, target);
@@ -1859,7 +1926,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_lock_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces[bit], len);
}
}
printk("%*s }\n", depth, "");
@@ -1884,7 +1951,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_lock_trace(&entry->trace, 2);
+ print_lock_trace(entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1995,14 +2062,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -2011,13 +2078,15 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- if (!save_trace(&prev_root->trace))
+ prev_root->trace = save_trace();
+ if (!prev_root->trace)
return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
- if (!save_trace(&next_root->trace))
+ next_root->trace = save_trace();
+ if (!next_root->trace)
return;
print_shortest_lock_dependencies(forwards_entry, next_root);
@@ -2369,7 +2438,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct lock_trace *trace)
+ struct held_lock *next, int distance,
+ struct lock_trace **const trace)
{
struct lock_list *entry;
int ret;
@@ -2444,8 +2514,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return ret;
#endif
- if (!trace->nr_entries && !save_trace(trace))
- return 0;
+ if (!*trace) {
+ *trace = save_trace();
+ if (!*trace)
+ return 0;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -2453,14 +2526,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
@@ -2476,7 +2549,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
- struct lock_trace trace = { .nr_entries = 0 };
+ struct lock_trace *trace = NULL;
int depth = curr->lockdep_depth;
struct held_lock *hlock;
@@ -3015,7 +3088,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -3096,7 +3169,8 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
- if (!save_trace(&root->trace))
+ root->trace = save_trace();
+ if (!root->trace)
return;
print_shortest_lock_dependencies(other, root);
@@ -3580,7 +3654,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
return 0;
switch (new_bit) {
@@ -4133,11 +4207,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
}
/*
- * Remove the lock to the list of currently held locks - this gets
+ * Remove the lock from the list of currently held locks - this gets
* called on mutex_unlock()/spin_unlock*() (or on a failed
* mutex_lock_interruptible()).
- *
- * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
__lock_release(struct lockdep_map *lock, unsigned long ip)
@@ -4416,8 +4488,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
EXPORT_SYMBOL_GPL(lock_acquire);
-void lock_release(struct lockdep_map *lock, int nested,
- unsigned long ip)
+void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
@@ -5157,6 +5228,12 @@ void __init lockdep_init(void)
) / 1024
);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ printk(" memory used for stack traces: %zu kB\n",
+ (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
+ );
+#endif
+
printk(" per task-struct memory footprint: %zu bytes\n",
sizeof(((struct task_struct *)NULL)->held_locks));
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index cc83568d5012..18d85aebbb57 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_ENTRIES 16384UL
#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define STACK_TRACE_HASH_SIZE 8192
#else
#define MAX_LOCKDEP_ENTRIES 32768UL
@@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 524288UL
+#define STACK_TRACE_HASH_SIZE 16384
#endif
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
@@ -116,7 +118,8 @@ extern struct lock_chain lock_chains[];
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
-extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+extern const char *__get_key_name(const struct lockdep_subclass_key *key,
+ char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
@@ -137,6 +140,10 @@ extern unsigned int max_bfs_queue_depth;
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#ifdef CONFIG_TRACE_IRQFLAGS
+u64 lockdep_stack_trace_count(void);
+u64 lockdep_stack_hash_count(void);
+#endif
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index bda006f8a88b..231684cfc5ae 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -285,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ seq_printf(m, " number of stack traces: %11llu\n",
+ lockdep_stack_trace_count());
+ seq_printf(m, " number of stack hash chains: %11llu\n",
+ lockdep_stack_hash_count());
+#endif
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
@@ -399,7 +405,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- struct lockdep_subclass_key *ckey;
+ const struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
struct lock_class *class;
const char *cname;
@@ -637,12 +643,12 @@ static int lock_stat_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations proc_lock_stat_operations = {
- .open = lock_stat_open,
- .write = lock_stat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lock_stat_release,
+static const struct proc_ops lock_stat_proc_ops = {
+ .proc_open = lock_stat_open,
+ .proc_write = lock_stat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = lock_stat_release,
};
#endif /* CONFIG_LOCK_STAT */
@@ -654,8 +660,7 @@ static int __init lockdep_proc_init(void)
#endif
proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
- &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops);
#endif
return 0;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index c513031cd7e3..99475a66c94f 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
@@ -889,16 +888,16 @@ static int __init lock_torture_init(void)
cxt.nrealwriters_stress = 2 * num_online_cpus();
#ifdef CONFIG_DEBUG_MUTEXES
- if (strncmp(torture_type, "mutex", 5) == 0)
+ if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
- if (strncmp(torture_type, "rtmutex", 7) == 0)
+ if (str_has_prefix(torture_type, "rtmutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
- if ((strncmp(torture_type, "spin", 4) == 0) ||
- (strncmp(torture_type, "rw_lock", 7) == 0))
+ if ((str_has_prefix(torture_type, "spin")) ||
+ (str_has_prefix(torture_type, "rw_lock")))
cxt.debug_lock = true;
#endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5e069734363c..5352ce50a97e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -65,11 +65,37 @@ EXPORT_SYMBOL(__mutex_init);
#define MUTEX_FLAGS 0x07
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
+{
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
+}
+
static inline struct task_struct *__owner_task(unsigned long owner)
{
return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}
+bool mutex_is_locked(struct mutex *lock)
+{
+ return __mutex_owner(lock) != NULL;
+}
+EXPORT_SYMBOL(mutex_is_locked);
+
+__must_check enum mutex_trylock_recursive_enum
+mutex_trylock_recursive(struct mutex *lock)
+{
+ if (unlikely(__mutex_owner(lock) == current))
+ return MUTEX_TRYLOCK_RECURSIVE;
+
+ return mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock_recursive);
+
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
@@ -1065,7 +1091,7 @@ err:
err_early_kill:
spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
preempt_enable();
return ret;
}
@@ -1199,7 +1225,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
/*
* Release the lock before (potentially) taking the spinlock such that
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 6ef600aa0f47..1f7734949ac8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
- * Use vcpu_is_preempted() to avoid waiting for a preempted
- * lock holder:
- */
- if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
- goto unqueue;
-
- cpu_relax();
- }
- return true;
+ /*
+ * Wait to acquire the lock or cancelation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
-unqueue:
+ /* unqueue */
/*
* Step - A -- stabilize @prev
*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2473f10c6956..b9515fcc9b29 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -31,14 +31,15 @@
/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
*
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
*
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
*
* In particular; where the traditional MCS lock consists of a tail pointer
* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 89bab079e7a4..e84d21aa0722 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
+ return READ_ONCE(prev->state) != vcpu_running;
}
/*
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fa83d36e30c6..851bbb10819d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/*
@@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. the owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
@@ -1519,7 +1517,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
return ret;
}
@@ -1563,7 +1561,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
RT_MUTEX_MIN_CHAINWALK,
rt_mutex_slowlock);
if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
return ret;
}
@@ -1602,7 +1600,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
*/
void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bd0f0d05724c..0d9b6be9ecc8 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -105,8 +105,9 @@
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
if (!debug_locks_silent && \
- WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
#c, atomic_long_read(&(sem)->count), \
+ (unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
list_empty(&(sem)->wait_list) ? "" : "not ")) \
debug_locks_off(); \
@@ -330,6 +331,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
+#ifdef CONFIG_DEBUG_RWSEMS
+ sem->magic = sem;
+#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
@@ -724,11 +728,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
rcu_read_lock();
for (;;) {
- if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
- state = OWNER_NONSPINNABLE;
- break;
- }
-
+ /*
+ * When a waiting writer set the handoff flag, it may spin
+ * on the owner as well. Once that writer acquires the lock,
+ * we can spin on it. So we don't need to quit even when the
+ * handoff bit is set.
+ */
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
state = rwsem_owner_state(new, new_flags, nonspinnable);
@@ -974,6 +979,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
{
return false;
}
+
+static inline int
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+{
+ return 0;
+}
+#define OWNER_NULL 1
#endif
/*
@@ -1206,6 +1218,18 @@ wait:
raw_spin_unlock_irq(&sem->wait_lock);
+ /*
+ * After setting the handoff bit and failing to acquire
+ * the lock, attempt to spin on owner to accelerate lock
+ * transfer. If the previous owner is a on-cpu writer and it
+ * has just released the lock, OWNER_NULL will be returned.
+ * In this case, we attempt to acquire the lock again
+ * without sleeping.
+ */
+ if (wstate == WRITER_HANDOFF &&
+ rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
+ goto trylock_again;
+
/* Block until there are no active lockers. */
for (;;) {
if (signal_pending_state(state, current))
@@ -1240,7 +1264,7 @@ wait:
break;
}
}
-
+trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
@@ -1338,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
/*
* Optimize for the case when the rwsem is not locked at all.
*/
- long tmp = RWSEM_UNLOCKED_VALUE;
-
+ tmp = RWSEM_UNLOCKED_VALUE;
do {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
@@ -1383,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
+ tmp = RWSEM_UNLOCKED_VALUE;
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
@@ -1400,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1418,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
/*
* sem->owner may differ from current if the ownership is transferred
* to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
*/
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
@@ -1470,7 +1504,7 @@ int __sched down_read_killable(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
@@ -1512,7 +1546,7 @@ int __sched down_write_killable(struct rw_semaphore *sem)
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
__down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
@@ -1539,7 +1573,7 @@ EXPORT_SYMBOL(down_write_trylock);
*/
void up_read(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_read(sem);
}
EXPORT_SYMBOL(up_read);
@@ -1549,7 +1583,7 @@ EXPORT_SYMBOL(up_read);
*/
void up_write(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_write(sem);
}
EXPORT_SYMBOL(up_write);
@@ -1605,7 +1639,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
__down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 399669f7eba8..472dd462a40c 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init);
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -187,8 +187,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -197,8 +197,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
diff --git a/kernel/module.c b/kernel/module.c
index 5933395af9a0..33569a01d6e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -65,9 +66,9 @@
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
- * only when CONFIG_STRICT_MODULE_RWX=y
+ * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_STRICT_MODULE_RWX
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
@@ -213,7 +214,8 @@ static struct module *mod_find(unsigned long addr)
{
struct module *mod;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (within_module(addr, mod))
return mod;
}
@@ -447,7 +449,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -544,12 +547,20 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym)
#endif
}
-static int cmp_name(const void *va, const void *vb)
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
{
- const char *a;
- const struct kernel_symbol *b;
- a = va; b = vb;
- return strcmp(a, kernel_symbol_name(b));
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+static int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
}
static bool find_exported_symbol_in_section(const struct symsearch *syms,
@@ -607,7 +618,8 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1024,6 +1036,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
return 0;
out:
mutex_unlock(&module_mutex);
@@ -1379,6 +1393,41 @@ static inline int same_magic(const char *amagic, const char *bmagic,
}
#endif /* CONFIG_MODVERSIONS */
+static char *get_modinfo(const struct load_info *info, const char *tag);
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev);
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace && namespace[0]) {
+ imported_namespace = get_modinfo(info, "import_ns");
+ while (imported_namespace) {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ imported_namespace = get_next_modinfo(
+ info, "import_ns", imported_namespace);
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
const struct load_info *info,
@@ -1407,6 +1456,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
goto getname;
}
+ err = verify_namespace_is_imported(info, sym, mod);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
err = ref_module(mod, owner);
if (err) {
sym = ERR_PTR(err);
@@ -1729,6 +1784,8 @@ static int module_add_modinfo_attrs(struct module *mod)
error_out:
if (i > 0)
module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
return error;
}
@@ -1979,49 +2036,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
-/* Iterate through all modules and set each module's text as RW */
-void set_all_modules_text_rw(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- frob_text(&mod->core_layout, set_memory_rw);
- frob_text(&mod->init_layout, set_memory_rw);
- }
- mutex_unlock(&module_mutex);
-}
-
-/* Iterate through all modules and set each module's text as RO */
-void set_all_modules_text_ro(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- /*
- * Ignore going modules since it's possible that ro
- * protection has already been disabled, otherwise we'll
- * run into protection faults at module deallocation.
- */
- if (mod->state == MODULE_STATE_UNFORMED ||
- mod->state == MODULE_STATE_GOING)
- continue;
-
- frob_text(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- }
- mutex_unlock(&module_mutex);
-}
#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
#endif /* CONFIG_STRICT_MODULE_RWX */
@@ -2481,7 +2495,8 @@ static char *next_string(char *string, unsigned long *secsize)
return string;
}
-static char *get_modinfo(struct load_info *info, const char *tag)
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
{
char *p;
unsigned int taglen = strlen(tag);
@@ -2492,13 +2507,25 @@ static char *get_modinfo(struct load_info *info, const char *tag)
* get_modinfo() calls made before rewrite_section_headers()
* must use sh_offset, as sh_addr isn't set!
*/
- for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) {
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = next_string(prev, &size);
+ }
+
+ for (p = modinfo; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
return NULL;
}
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
@@ -2776,8 +2803,9 @@ static inline void kmemleak_load_module(const struct module *mod,
#ifdef CONFIG_MODULE_SIG
static int module_sig_check(struct load_info *info, int flags)
{
- int err = -ENOKEY;
+ int err = -ENODATA;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
const void *mod = info->hdr;
/*
@@ -2792,16 +2820,38 @@ static int module_sig_check(struct load_info *info, int flags)
err = mod_verify_sig(mod, info);
}
- if (!err) {
+ switch (err) {
+ case 0:
info->sig_ok = true;
return 0;
- }
- /* Not having a signature is only an error if we're strict. */
- if (err == -ENOKEY && !is_module_sig_enforced())
- err = 0;
+ /* We don't permit modules to be loaded into trusted kernels
+ * without a valid signature on them, but if we're not
+ * enforcing, certain errors are non-fatal.
+ */
+ case -ENODATA:
+ reason = "Loading of unsigned module";
+ goto decide;
+ case -ENOPKG:
+ reason = "Loading of module with unsupported crypto";
+ goto decide;
+ case -ENOKEY:
+ reason = "Loading of module with unavailable key";
+ decide:
+ if (is_module_sig_enforced()) {
+ pr_notice("%s: %s is rejected\n", info->name, reason);
+ return -EKEYREJECTED;
+ }
- return err;
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+
+ /* All other errors are fatal, including nomem, unparseable
+ * signatures and signature check failures - even if signatures
+ * aren't required.
+ */
+ default:
+ return err;
+ }
}
#else /* !CONFIG_MODULE_SIG */
static int module_sig_check(struct load_info *info, int flags)
@@ -2966,9 +3016,7 @@ static int setup_load_info(struct load_info *info, int flags)
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
- if (!info->index.info)
- info->name = "(missing .modinfo section)";
- else
+ if (info->index.info)
info->name = get_modinfo(info, "name");
/* Find internal symbols and strings. */
@@ -2983,14 +3031,15 @@ static int setup_load_info(struct load_info *info, int flags)
}
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo name field)");
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
/* This is temporary: point mod into copy of data. */
@@ -3136,7 +3185,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
/* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
sizeof(*mod->ftrace_callsites),
&mod->num_ftrace_callsites);
#endif
@@ -4305,16 +4354,16 @@ static int modules_open(struct inode *inode, struct file *file)
return err;
}
-static const struct file_operations proc_modules_operations = {
- .open = modules_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops modules_proc_ops = {
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init proc_modules_init(void)
{
- proc_create("modules", 0, NULL, &proc_modules_operations);
+ proc_create("modules", 0, NULL, &modules_proc_ops);
return 0;
}
module_init(proc_modules_init);
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index 000000000000..4224a1086b7d
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/module_signature.h>
+#include <asm/byteorder.h>
+
+/**
+ * mod_check_sig - check that the given signature is sane
+ *
+ * @ms: Signature to check.
+ * @file_len: Size of the file to which @ms is appended.
+ * @name: What is being checked. Used for error messages.
+ */
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+ const char *name)
+{
+ if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
+ return -EBADMSG;
+
+ if (ms->id_type != PKEY_ID_PKCS7) {
+ pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ name);
+ return -ENOPKG;
+ }
+
+ if (ms->algo != 0 ||
+ ms->hash != 0 ||
+ ms->signer_len != 0 ||
+ ms->key_id_len != 0 ||
+ ms->__pad[0] != 0 ||
+ ms->__pad[1] != 0 ||
+ ms->__pad[2] != 0) {
+ pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+ name);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index b10fb1986ca9..9d9fc678c91d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -7,37 +7,13 @@
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
#include <linux/string.h>
#include <linux/verification.h>
#include <crypto/public_key.h>
#include "module-internal.h"
-enum pkey_id_type {
- PKEY_ID_PGP, /* OpenPGP generated key ID */
- PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */
- PKEY_ID_PKCS7, /* Signature in PKCS#7 message */
-};
-
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- * - Signer's name
- * - Key identifier
- * - Signature data
- * - Information block
- */
-struct module_signature {
- u8 algo; /* Public-key crypto algorithm [0] */
- u8 hash; /* Digest algorithm [0] */
- u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
- u8 signer_len; /* Length of signer's name [0] */
- u8 key_id_len; /* Length of key identifier [0] */
- u8 __pad[3];
- __be32 sig_len; /* Length of signature data */
-};
-
/*
* Verify the signature on a module.
*/
@@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
{
struct module_signature ms;
size_t sig_len, modlen = info->len;
+ int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info)
return -EBADMSG;
memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
- modlen -= sizeof(ms);
+
+ ret = mod_check_sig(&ms, modlen, info->name);
+ if (ret)
+ return ret;
sig_len = be32_to_cpu(ms.sig_len);
- if (sig_len >= modlen)
- return -EBADMSG;
- modlen -= sig_len;
+ modlen -= sig_len + sizeof(ms);
info->len = modlen;
- if (ms.id_type != PKEY_ID_PKCS7) {
- pr_err("%s: Module is not signed with expected PKCS#7 message\n",
- info->name);
- return -ENOPKG;
- }
-
- if (ms.algo != 0 ||
- ms.hash != 0 ||
- ms.signer_len != 0 ||
- ms.key_id_len != 0 ||
- ms.__pad[0] != 0 ||
- ms.__pad[1] != 0 ||
- ms.__pad[2] != 0) {
- pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
- info->name);
- return -EBADMSG;
- }
-
return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
VERIFY_USE_SECONDARY_KEYRING,
VERIFYING_MODULE_SIGNATURE,
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d9f5081d578d..63d7501ac638 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -23,22 +23,10 @@ static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
- WARN_ONCE(((*nl) == n), "double register detected");
- if (n->priority > (*nl)->priority)
- break;
- nl = &((*nl)->next);
- }
- n->next = *nl;
- rcu_assign_pointer(*nl, n);
- return 0;
-}
-
-static int notifier_chain_cond_register(struct notifier_block **nl,
- struct notifier_block *n)
-{
- while ((*nl) != NULL) {
- if ((*nl) == n)
+ if (unlikely((*nl) == n)) {
+ WARN(1, "double register detected");
return 0;
+ }
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
@@ -233,29 +221,6 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
- * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain, only if not already
- * present in the chain.
- * Must be called in process context.
- *
- * Currently always returns zero.
- */
-int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
-{
- int ret;
-
- down_write(&nh->rwsem);
- ret = notifier_chain_cond_register(&nh->head, n);
- up_write(&nh->rwsem);
- return ret;
-}
-EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
-
-/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c815f58e6bc0..ed9882108cd2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#ifdef CONFIG_TIME_NS
+ .time_ns = &init_time_ns,
+ .time_ns_for_children = &init_time_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_net;
}
+ new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+ tsk->nsproxy->time_ns_for_children);
+ if (IS_ERR(new_nsp->time_ns_for_children)) {
+ err = PTR_ERR(new_nsp->time_ns_for_children);
+ goto out_time;
+ }
+ new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
return new_nsp;
+out_time:
+ put_net(new_nsp->net_ns);
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
@@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
+ int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
- get_nsproxy(old_ns);
- return 0;
- }
-
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+ } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
+ ret = timens_on_fork(new_ns, tsk);
+ if (ret) {
+ free_nsproxy(new_ns);
+ return ret;
+ }
+
tsk->nsproxy = new_ns;
return 0;
}
@@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->time_ns)
+ put_time_ns(ns->time_ns);
+ if (ns->time_ns_for_children)
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
@@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/padata.c b/kernel/padata.c
index 15a8ad63f4ff..72777c10bb9c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -2,7 +2,7 @@
/*
* padata.c - generic interface to process data streams in parallel
*
- * See Documentation/padata.txt for an api documentation.
+ * See Documentation/core-api/padata.rst for more information.
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
@@ -35,6 +35,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -46,18 +48,13 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
return target_cpu;
}
-static int padata_cpu_hash(struct parallel_data *pd)
+static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
{
- unsigned int seq_nr;
- int cpu_index;
-
/*
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
-
- seq_nr = atomic_inc_return(&pd->seq_nr);
- cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
+ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
return padata_index_to_cpu(pd, cpu_index);
}
@@ -92,32 +89,48 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
/**
* padata_do_parallel - padata parallelization function
*
- * @pinst: padata instance
+ * @ps: padatashell
* @padata: object to be parallelized
- * @cb_cpu: cpu the serialization callback function will run on,
- * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
+ * @cb_cpu: pointer to the CPU that the serialization callback function should
+ * run on. If it's not in the serial cpumask of @pinst
+ * (i.e. cpumask.cbcpu), this function selects a fallback CPU and if
+ * none found, returns -EINVAL.
*
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
+ *
+ * Return: 0 on success or else negative error code.
*/
-int padata_do_parallel(struct padata_instance *pinst,
- struct padata_priv *padata, int cb_cpu)
+int padata_do_parallel(struct padata_shell *ps,
+ struct padata_priv *padata, int *cb_cpu)
{
- int target_cpu, err;
+ struct padata_instance *pinst = ps->pinst;
+ int i, cpu, cpu_index, target_cpu, err;
struct padata_parallel_queue *queue;
struct parallel_data *pd;
rcu_read_lock_bh();
- pd = rcu_dereference_bh(pinst->pd);
+ pd = rcu_dereference_bh(ps->pd);
err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
goto out;
- if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
- goto out;
+ if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
+ if (!cpumask_weight(pd->cpumask.cbcpu))
+ goto out;
+
+ /* Select an alternate fallback CPU and notify the caller. */
+ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
+
+ cpu = cpumask_first(pd->cpumask.cbcpu);
+ for (i = 0; i < cpu_index; i++)
+ cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
+
+ *cb_cpu = cpu;
+ }
err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
@@ -129,9 +142,10 @@ int padata_do_parallel(struct padata_instance *pinst,
err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
- padata->cb_cpu = cb_cpu;
+ padata->cb_cpu = *cb_cpu;
- target_cpu = padata_cpu_hash(pd);
+ padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+ target_cpu = padata_cpu_hash(pd, padata->seq_nr);
padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
@@ -139,7 +153,7 @@ int padata_do_parallel(struct padata_instance *pinst,
list_add_tail(&padata->list, &queue->parallel.list);
spin_unlock(&queue->parallel.lock);
- queue_work_on(target_cpu, pinst->wq, &queue->work);
+ queue_work(pinst->parallel_wq, &queue->work);
out:
rcu_read_unlock_bh();
@@ -149,72 +163,60 @@ out:
EXPORT_SYMBOL(padata_do_parallel);
/*
- * padata_get_next - Get the next object that needs serialization.
- *
- * Return values are:
- *
- * A pointer to the control struct of the next object that needs
- * serialization, if present in one of the percpu reorder queues.
+ * padata_find_next - Find the next object that needs serialization.
*
- * -EINPROGRESS, if the next object that needs serialization will
- * be parallel processed by another cpu and is not yet present in
- * the cpu's reorder queue.
- *
- * -ENODATA, if this cpu has to do the parallel processing for
- * the next object.
+ * Return:
+ * * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ * * NULL, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
*/
-static struct padata_priv *padata_get_next(struct parallel_data *pd)
+static struct padata_priv *padata_find_next(struct parallel_data *pd,
+ bool remove_object)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
+ int cpu = pd->cpu;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
- if (!list_empty(&reorder->list)) {
- padata = list_entry(reorder->list.next,
- struct padata_priv, list);
-
- list_del_init(&padata->list);
- atomic_dec(&pd->reorder_objects);
+ if (list_empty(&reorder->list)) {
+ spin_unlock(&reorder->lock);
+ return NULL;
+ }
- pd->processed++;
+ padata = list_entry(reorder->list.next, struct padata_priv, list);
+ /*
+ * Checks the rare case where two or more parallel jobs have hashed to
+ * the same CPU and one of the later ones finishes first.
+ */
+ if (padata->seq_nr != pd->processed) {
spin_unlock(&reorder->lock);
- goto out;
+ return NULL;
}
- spin_unlock(&reorder->lock);
- if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
- padata = ERR_PTR(-ENODATA);
- goto out;
+ if (remove_object) {
+ list_del_init(&padata->list);
+ ++pd->processed;
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
}
- padata = ERR_PTR(-EINPROGRESS);
-out:
+ spin_unlock(&reorder->lock);
return padata;
}
static void padata_reorder(struct parallel_data *pd)
{
+ struct padata_instance *pinst = pd->ps->pinst;
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
- struct padata_instance *pinst = pd->pinst;
+ struct padata_parallel_queue *next_queue;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -230,27 +232,16 @@ static void padata_reorder(struct parallel_data *pd)
return;
while (1) {
- padata = padata_get_next(pd);
+ padata = padata_find_next(pd, true);
/*
* If the next object that needs serialization is parallel
* processed by another cpu and is still on it's way to the
* cpu's reorder queue, nothing to do for now.
*/
- if (PTR_ERR(padata) == -EINPROGRESS)
+ if (!padata)
break;
- /*
- * This cpu has to do the parallel processing of the next
- * object. It's waiting in the cpu's parallelization queue,
- * so exit immediately.
- */
- if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
- spin_unlock_bh(&pd->lock);
- return;
- }
-
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
@@ -258,82 +249,43 @@ static void padata_reorder(struct parallel_data *pd)
list_add_tail(&padata->list, &squeue->serial.list);
spin_unlock(&squeue->serial.lock);
- queue_work_on(cb_cpu, pinst->wq, &squeue->work);
+ queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
}
spin_unlock_bh(&pd->lock);
/*
* The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
+ * the reorder queues in the meantime.
*
- * Ensure reorder_objects is read after pd->lock is dropped so we see
- * an increment from another task in padata_do_serial. Pairs with
+ * Ensure reorder queue is read after pd->lock is dropped so we see
+ * new objects from another task in padata_do_serial. Pairs with
* smp_mb__after_atomic in padata_do_serial.
*/
smp_mb();
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
- return;
+ next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
+ if (!list_empty(&next_queue->reorder.list) &&
+ padata_find_next(pd, false))
+ queue_work(pinst->serial_wq, &pd->reorder_work);
}
static void invoke_padata_reorder(struct work_struct *work)
{
- struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
local_bh_disable();
- pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
- pd = pqueue->pd;
+ pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
local_bh_enable();
}
-static void padata_reorder_timer(struct timer_list *t)
-{
- struct parallel_data *pd = from_timer(pd, t, timer);
- unsigned int weight;
- int target_cpu, cpu;
-
- cpu = get_cpu();
-
- /* We don't lock pd here to not interfere with parallel processing
- * padata_reorder() calls on other CPUs. We just need any CPU out of
- * the cpumask.pcpu set. It would be nice if it's the right one but
- * it doesn't matter if we're off to the next one by using an outdated
- * pd->processed value.
- */
- weight = cpumask_weight(pd->cpumask.pcpu);
- target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
-
- /* ensure to call the reorder callback on the correct CPU */
- if (cpu != target_cpu) {
- struct padata_parallel_queue *pqueue;
- struct padata_instance *pinst;
-
- /* The timer function is serialized wrt itself -- no locking
- * needed.
- */
- pinst = pd->pinst;
- pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
- queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
- } else {
- padata_reorder(pd);
- }
-
- put_cpu();
-}
-
static void padata_serial_worker(struct work_struct *serial_work)
{
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -343,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -352,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -367,65 +324,67 @@ static void padata_serial_worker(struct work_struct *serial_work)
*/
void padata_do_serial(struct padata_priv *padata)
{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- int reorder_via_wq = 0;
-
- pd = padata->pd;
-
- cpu = get_cpu();
-
- /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
- * was called on -- or, at least, enqueue the padata object into the
- * correct per-cpu queue.
- */
- if (cpu != padata->cpu) {
- reorder_via_wq = 1;
- cpu = padata->cpu;
- }
-
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ struct parallel_data *pd = padata->pd;
+ struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
+ padata->cpu);
+ struct padata_priv *cur;
spin_lock(&pqueue->reorder.lock);
- atomic_inc(&pd->reorder_objects);
- list_add_tail(&padata->list, &pqueue->reorder.list);
+ /* Sort in ascending order of sequence number. */
+ list_for_each_entry_reverse(cur, &pqueue->reorder.list, list)
+ if (cur->seq_nr < padata->seq_nr)
+ break;
+ list_add(&padata->list, &cur->list);
spin_unlock(&pqueue->reorder.lock);
/*
- * Ensure the atomic_inc of reorder_objects above is ordered correctly
+ * Ensure the addition to the reorder list is ordered correctly
* with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
* in padata_reorder.
*/
smp_mb__after_atomic();
- put_cpu();
-
- /* If we're running on the wrong CPU, call padata_reorder() via a
- * kernel worker.
- */
- if (reorder_via_wq)
- queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
- else
- padata_reorder(pd);
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
-static int padata_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static int padata_setup_cpumasks(struct padata_instance *pinst)
{
- if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
- return -ENOMEM;
+ struct workqueue_attrs *attrs;
+ int err;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
- if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.pcpu);
+ attrs = alloc_workqueue_attrs();
+ if (!attrs)
return -ENOMEM;
- }
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
+ /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
+ cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu);
+ err = apply_workqueue_attrs(pinst->parallel_wq, attrs);
+ free_workqueue_attrs(attrs);
+
+ return err;
+}
+
+static int pd_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ int err = -ENOMEM;
+
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+ goto out;
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+ goto free_pcpu_mask;
+
+ cpumask_copy(pd->cpumask.pcpu, pcpumask);
+ cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
+
return 0;
+
+free_pcpu_mask:
+ free_cpumask_var(pd->cpumask.pcpu);
+out:
+ return err;
}
static void __padata_list_init(struct padata_list *pd_list)
@@ -451,37 +410,30 @@ static void padata_init_squeues(struct parallel_data *pd)
/* Initialize all percpu queues used by parallel workers */
static void padata_init_pqueues(struct parallel_data *pd)
{
- int cpu_index, cpu;
+ int cpu;
struct padata_parallel_queue *pqueue;
- cpu_index = 0;
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
- if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
- pqueue->cpu_index = -1;
- continue;
- }
-
- pqueue->pd = pd;
- pqueue->cpu_index = cpu_index;
- cpu_index++;
-
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
- INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
+ struct padata_instance *pinst = ps->pinst;
+ const struct cpumask *cbcpumask;
+ const struct cpumask *pcpumask;
struct parallel_data *pd;
+ cbcpumask = pinst->rcpumask.cbcpu;
+ pcpumask = pinst->rcpumask.pcpu;
+
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
@@ -493,17 +445,18 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
pd->squeue = alloc_percpu(struct padata_serial_queue);
if (!pd->squeue)
goto err_free_pqueue;
- if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+
+ pd->ps = ps;
+ if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
goto err_free_squeue;
padata_init_pqueues(pd);
padata_init_squeues(pd);
- timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
- atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
- pd->pinst = pinst;
+ atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
+ pd->cpu = cpumask_first(pd->cpumask.pcpu);
+ INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -526,31 +479,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- del_timer_sync(&pd->timer);
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -564,72 +492,52 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
-static void padata_replace(struct padata_instance *pinst,
- struct parallel_data *pd_new)
+static int padata_replace_one(struct padata_shell *ps)
+{
+ struct parallel_data *pd_new;
+
+ pd_new = padata_alloc_pd(ps);
+ if (!pd_new)
+ return -ENOMEM;
+
+ ps->opd = rcu_dereference_protected(ps->pd, 1);
+ rcu_assign_pointer(ps->pd, pd_new);
+
+ return 0;
+}
+
+static int padata_replace(struct padata_instance *pinst)
{
- struct parallel_data *pd_old = pinst->pd;
- int notification_mask = 0;
+ struct padata_shell *ps;
+ int err;
pinst->flags |= PADATA_RESET;
- rcu_assign_pointer(pinst->pd, pd_new);
+ cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
+ cpu_online_mask);
- synchronize_rcu();
+ cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
+ cpu_online_mask);
- if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
- notification_mask |= PADATA_CPU_PARALLEL;
- if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
- notification_mask |= PADATA_CPU_SERIAL;
+ list_for_each_entry(ps, &pinst->pslist, list) {
+ err = padata_replace_one(ps);
+ if (err)
+ break;
+ }
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ synchronize_rcu();
- if (notification_mask)
- blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
- notification_mask,
- &pd_new->cpumask);
+ list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
+ if (atomic_dec_and_test(&ps->opd->refcnt))
+ padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
-}
-/**
- * padata_register_cpumask_notifier - Registers a notifier that will be called
- * if either pcpu or cbcpu or both cpumasks change.
- *
- * @pinst: A poineter to padata instance
- * @nblock: A pointer to notifier block.
- */
-int padata_register_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
- nblock);
-}
-EXPORT_SYMBOL(padata_register_cpumask_notifier);
-
-/**
- * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
- * registered earlier using padata_register_cpumask_notifier
- *
- * @pinst: A pointer to data instance.
- * @nlock: A pointer to notifier block.
- */
-int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_unregister(
- &pinst->cpumask_change_notifier,
- nblock);
+ return err;
}
-EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
-
/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
@@ -649,7 +557,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
cpumask_var_t cbcpumask)
{
int valid;
- struct parallel_data *pd;
+ int err;
valid = padata_validate_cpumask(pinst, pcpumask);
if (!valid) {
@@ -662,29 +570,26 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
__padata_stop(pinst);
out_replace:
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
- return -ENOMEM;
-
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- padata_replace(pinst, pd);
+ err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);
if (valid)
__padata_start(pinst);
- return 0;
+ return err;
}
/**
- * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
- * equivalent to @cpumask.
- *
+ * padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
* @pinst: padata instance
* @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
* to parallel and serial cpumasks respectively.
* @cpumask: the cpumask to use
+ *
+ * Return: 0 on success or negative error code
*/
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask)
@@ -692,8 +597,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -711,8 +616,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
@@ -722,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask);
* padata_start - start the parallel processing
*
* @pinst: padata instance to start
+ *
+ * Return: 0 on success or negative error code
*/
int padata_start(struct padata_instance *pinst)
{
@@ -757,82 +664,33 @@ EXPORT_SYMBOL(padata_stop);
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ int err = 0;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
+ err = padata_replace(pinst);
if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_start(pinst);
}
- return 0;
+ return err;
}
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd = NULL;
-
- if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ int err = 0;
+ if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
!padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_stop(pinst);
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
-
- cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
- cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
+ err = padata_replace(pinst);
}
- return 0;
-}
-
- /**
- * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to remove
- * @mask: bitmask specifying from which cpumask @cpu should be removed
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_remove_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
return err;
}
-EXPORT_SYMBOL(padata_remove_cpu);
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
@@ -855,7 +713,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
return ret;
}
-static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
+static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct padata_instance *pinst;
int ret;
@@ -876,13 +734,19 @@ static enum cpuhp_state hp_online;
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
+ cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node);
cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif
+ WARN_ON(!list_empty(&pinst->pslist));
+
padata_stop(pinst);
- padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+ free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+ destroy_workqueue(pinst->serial_wq);
+ destroy_workqueue(pinst->parallel_wq);
kfree(pinst);
}
@@ -1016,58 +880,86 @@ static struct kobj_type padata_attr_type = {
* padata_alloc - allocate and initialize a padata instance and specify
* cpumasks for serial and parallel workers.
*
- * @wq: workqueue to use for the allocated padata instance
+ * @name: used to identify the instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
*
- * Must be called from a cpus_read_lock() protected region
+ * Return: new instance on success, NULL on error
*/
-static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+static struct padata_instance *padata_alloc(const char *name,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
- struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
goto err;
- if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0,
+ name);
+ if (!pinst->parallel_wq)
goto err_free_inst;
+
+ get_online_cpus();
+
+ pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
+ WQ_CPU_INTENSIVE, 1, name);
+ if (!pinst->serial_wq)
+ goto err_put_cpus;
+
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ goto err_free_serial_wq;
if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
free_cpumask_var(pinst->cpumask.pcpu);
- goto err_free_inst;
+ goto err_free_serial_wq;
}
if (!padata_validate_cpumask(pinst, pcpumask) ||
!padata_validate_cpumask(pinst, cbcpumask))
goto err_free_masks;
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
+ if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
goto err_free_masks;
+ if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
+ goto err_free_rcpumask_pcpu;
- rcu_assign_pointer(pinst->pd, pd);
-
- pinst->wq = wq;
+ INIT_LIST_HEAD(&pinst->pslist);
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+ cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
+ cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
+
+ if (padata_setup_cpumasks(pinst))
+ goto err_free_rcpumask_cbcpu;
pinst->flags = 0;
- BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
+ cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
+ &pinst->node);
#endif
+
+ put_online_cpus();
+
return pinst;
+err_free_rcpumask_cbcpu:
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+err_free_rcpumask_pcpu:
+ free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+err_free_serial_wq:
+ destroy_workqueue(pinst->serial_wq);
+err_put_cpus:
+ put_online_cpus();
+ destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
err:
@@ -1079,21 +971,20 @@ err:
* Use the cpu_possible_mask for serial and
* parallel workers.
*
- * @wq: workqueue to use for the allocated padata instance
+ * @name: used to identify the instance
*
- * Must be called from a cpus_read_lock() protected region
+ * Return: new instance on success, NULL on error
*/
-struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+struct padata_instance *padata_alloc_possible(const char *name)
{
- lockdep_assert_cpus_held();
- return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+ return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);
}
EXPORT_SYMBOL(padata_alloc_possible);
/**
* padata_free - free a padata instance
*
- * @padata_inst: padata instance to free
+ * @pinst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
@@ -1101,6 +992,63 @@ void padata_free(struct padata_instance *pinst)
}
EXPORT_SYMBOL(padata_free);
+/**
+ * padata_alloc_shell - Allocate and initialize padata shell.
+ *
+ * @pinst: Parent padata_instance object.
+ *
+ * Return: new shell on success, NULL on error
+ */
+struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
+{
+ struct parallel_data *pd;
+ struct padata_shell *ps;
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out;
+
+ ps->pinst = pinst;
+
+ get_online_cpus();
+ pd = padata_alloc_pd(ps);
+ put_online_cpus();
+
+ if (!pd)
+ goto out_free_ps;
+
+ mutex_lock(&pinst->lock);
+ RCU_INIT_POINTER(ps->pd, pd);
+ list_add(&ps->list, &pinst->pslist);
+ mutex_unlock(&pinst->lock);
+
+ return ps;
+
+out_free_ps:
+ kfree(ps);
+out:
+ return NULL;
+}
+EXPORT_SYMBOL(padata_alloc_shell);
+
+/**
+ * padata_free_shell - free a padata shell
+ *
+ * @ps: padata shell to free
+ */
+void padata_free_shell(struct padata_shell *ps)
+{
+ struct padata_instance *pinst = ps->pinst;
+
+ mutex_lock(&pinst->lock);
+ list_del(&ps->list);
+ padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ mutex_unlock(&pinst->lock);
+
+ kfree(ps);
+}
+EXPORT_SYMBOL(padata_free_shell);
+
#ifdef CONFIG_HOTPLUG_CPU
static __init int padata_driver_init(void)
@@ -1108,17 +1056,24 @@ static __init int padata_driver_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
- padata_cpu_online,
- padata_cpu_prep_down);
+ padata_cpu_online, NULL);
if (ret < 0)
return ret;
hp_online = ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
+ NULL, padata_cpu_dead);
+ if (ret < 0) {
+ cpuhp_remove_multi_state(hp_online);
+ return ret;
+ }
return 0;
}
module_init(padata_driver_init);
static __exit void padata_driver_exit(void)
{
+ cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
cpuhp_remove_multi_state(hp_online);
}
module_exit(padata_driver_exit);
diff --git a/kernel/panic.c b/kernel/panic.c
index 057540b6eee9..b69ee9e76cb2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/sched/debug.h>
#include <linux/interrupt.h>
+#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
@@ -179,6 +180,7 @@ void panic(const char *fmt, ...)
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
@@ -220,6 +222,13 @@ void panic(const char *fmt, ...)
#endif
/*
+ * If kgdb is enabled, give it a chance to run before we stop all
+ * the other CPUs or else we won't be able to debug processes left
+ * running on them.
+ */
+ kgdb_panic(buf);
+
+ /*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
@@ -551,9 +560,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- if (args)
- pr_warn(CUT_HERE);
-
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
@@ -591,37 +597,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
struct warn_args args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
- &args);
- va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
+ pr_warn(CUT_HERE);
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
-{
- struct warn_args args;
+ if (!fmt) {
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
- pr_warn(CUT_HERE);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
@@ -676,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
-#ifdef CONFIG_ARCH_HAS_REFCOUNT
-void refcount_error_report(struct pt_regs *regs, const char *err)
-{
- WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
- err, (void *)instruction_pointer(regs),
- current->comm, task_pid_nr(current),
- from_kuid_munged(&init_user_ns, current_uid()),
- from_kuid_munged(&init_user_ns, current_euid()));
-}
-#endif
-
core_param(panic, panic_timeout, int, 0644);
core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index cf448785d058..8e56f8b12d8f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -12,6 +12,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/security.h>
#ifdef CONFIG_SYSFS
/* Protects all built-in parameters, modules use their own param_lock */
@@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
-static void param_check_unsafe(const struct kernel_param *kp)
+static bool param_check_unsafe(const struct kernel_param *kp)
{
+ if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
+ security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
+ return false;
+
if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
pr_notice("Setting dangerous option %s - tainting kernel\n",
kp->name);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
+
+ return true;
}
static int parse_one(char *param,
@@ -132,8 +139,10 @@ static int parse_one(char *param,
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
kernel_param_lock(params[i].mod);
- param_check_unsafe(&params[i]);
- err = params[i].ops->set(val, &params[i]);
+ if (param_check_unsafe(&params[i]))
+ err = params[i].ops->set(val, &params[i]);
+ else
+ err = -EPERM;
kernel_param_unlock(params[i].mod);
return err;
}
@@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
kernel_param_lock(mk->mod);
- param_check_unsafe(attribute->param);
- err = attribute->param->ops->set(buf, attribute->param);
+ if (param_check_unsafe(attribute->param))
+ err = attribute->param->ops->set(buf, attribute->param);
+ else
+ err = -EPERM;
kernel_param_unlock(mk->mod);
if (!err)
return len;
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..0f4ecb57214c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+ size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
@@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
int retval = -ENOMEM;
+ /*
+ * set_tid_size contains the size of the set_tid array. Starting at
+ * the most nested currently active PID namespace it tells alloc_pid()
+ * which PID to set for a process in that most nested PID namespace
+ * up to set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but set_tid_size must
+ * never be greater than the current ns->level + 1.
+ */
+ if (set_tid_size > ns->level + 1)
+ return ERR_PTR(-EINVAL);
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
@@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- int pid_min = 1;
+ int tid = 0;
+
+ if (set_tid_size) {
+ tid = set_tid[ns->level - i];
+
+ retval = -EINVAL;
+ if (tid < 1 || tid >= pid_max)
+ goto out_free;
+ /*
+ * Also fail if a PID != 1 is requested and
+ * no PID 1 exists.
+ */
+ if (tid != 1 && !tmp->child_reaper)
+ goto out_free;
+ retval = -EPERM;
+ if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+ goto out_free;
+ set_tid_size--;
+ }
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
- /*
- * init really needs pid 1, but after reaching the maximum
- * wrap back to RESERVED_PIDS
- */
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
- pid_min = RESERVED_PIDS;
-
- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ if (tid) {
+ nr = idr_alloc(&tmp->idr, NULL, tid,
+ tid + 1, GFP_ATOMIC);
+ /*
+ * If ENOSPC is returned it means that the PID is
+ * alreay in use. Return EEXIST in that case.
+ */
+ if (nr == -ENOSPC)
+ nr = -EEXIST;
+ } else {
+ int pid_min = 1;
+ /*
+ * init really needs pid 1, but after reaching the
+ * maximum wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -299,7 +341,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
- if (!hlist_empty(&pid->tasks[tmp]))
+ if (pid_has_task(pid, tmp))
return;
free_pid(pid);
@@ -497,7 +539,7 @@ static int pidfd_create(struct pid *pid)
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
- int fd, ret;
+ int fd;
struct pid *p;
if (flags)
@@ -510,13 +552,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- ret = 0;
- rcu_read_lock();
- if (!pid_task(p, PIDTYPE_TGID))
- ret = -EINVAL;
- rcu_read_unlock();
+ if (pid_has_task(p, PIDTYPE_TGID))
+ fd = pidfd_create(p);
+ else
+ fd = -EINVAL;
- fd = ret ?: pidfd_create(p);
put_pid(p);
return fd;
}
@@ -538,3 +578,93 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+ struct file *file;
+ int ret;
+
+ ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+ file = fget_task(task, fd);
+ else
+ file = ERR_PTR(-EPERM);
+
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return file ?: ERR_PTR(-EBADF);
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+ struct task_struct *task;
+ struct file *file;
+ int ret;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ file = __pidfd_fget(task, fd);
+ put_task_struct(task);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = security_file_receive(file);
+ if (ret) {
+ fput(file);
+ return ret;
+ }
+
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ fput(file);
+ else
+ fd_install(ret, file);
+
+ return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd: the pidfd file descriptor of the process
+ * @fd: the file descriptor number to get
+ * @flags: flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+ unsigned int, flags)
+{
+ struct pid *pid;
+ struct fd f;
+ int ret;
+
+ /* flags is currently unused - make sure it's unset */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget(pidfd);
+ if (!f.file)
+ return -EBADF;
+
+ pid = pidfd_pid(f.file);
+ if (IS_ERR(pid))
+ ret = PTR_ERR(pid);
+ else
+ ret = pidfd_getfd(pid, fd);
+
+ fdput(f);
+ return ret;
+}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a6a79f85c81a..d40017e79ebe 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -26,8 +26,6 @@
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
-#define MAX_PID_NS_LEVEL 32
/* Write once array, filled from the beginning. */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d3667b4075c1..7cbfbeacd68a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,7 +27,10 @@ config SUSPEND_SKIP_SYNC
Skip the kernel sys_sync() before freezing user processes.
Some systems prefer not to pay this cost on every invocation
of suspend, or they are content with invoking sync() from
- user-space before invoking suspend. Say Y if that's your case.
+ user-space before invoking suspend. There's a run-time switch
+ at '/sys/power/sync_on_suspend' to configure this behaviour.
+ This setting changes the default for the run-tim switch. Say Y
+ to change the default to disable the kernel sys_sync().
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 41e83a779e19..9af5a50d3489 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -116,7 +116,7 @@ int pm_autosleep_set_state(suspend_state_t state)
int __init pm_autosleep_init(void)
{
- autosleep_ws = wakeup_source_register("autosleep");
+ autosleep_ws = wakeup_source_register(NULL, "autosleep");
if (!autosleep_ws)
return -ENOMEM;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cd7434e6000d..6dbeedb7354c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,7 +9,7 @@
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/export.h>
#include <linux/suspend.h>
@@ -30,6 +30,7 @@
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <linux/ktime.h>
+#include <linux/security.h>
#include <trace/events/power.h>
#include "power.h"
@@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops;
bool hibernation_available(void)
{
- return (nohibernate == 0);
+ return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
}
/**
@@ -105,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
- pr_info("hibernation debug: Waiting for 5 seconds.\n");
+ pr_info("debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
@@ -276,7 +277,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- pr_err("Some devices failed to power down, aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting\n");
return error;
}
@@ -294,7 +295,7 @@ static int create_image(int platform_mode)
error = syscore_suspend();
if (error) {
- pr_err("Some system devices failed to power down, aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting\n");
goto Enable_irqs;
}
@@ -309,7 +310,7 @@ static int create_image(int platform_mode)
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
- pr_err("Error %d creating hibernation image\n", error);
+ pr_err("Error %d creating image\n", error);
if (!in_suspend) {
events_check_enabled = false;
@@ -679,7 +680,7 @@ static int load_image_and_restore(void)
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
- pr_err("Failed to load hibernation image, recovering.\n");
+ pr_err("Failed to load image, recovering.\n");
swsusp_free();
free_basic_memory_bitmaps();
Unlock:
@@ -742,7 +743,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pm_pr_dbg("Writing image.\n");
+ pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -754,7 +755,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pm_pr_dbg("Image restored successfully.\n");
+ pm_pr_dbg("Hibernation image restored successfully.\n");
}
Free_bitmaps:
@@ -893,7 +894,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pm_pr_dbg("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -902,7 +903,7 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
- pr_info("resume from hibernation failed (%d)\n", error);
+ pr_info("resume failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
@@ -1067,7 +1068,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device);
+ pm_pr_dbg("Configured hibernation resume from disk to %u\n",
+ swsusp_resume_device);
noresume = 0;
software_resume();
return n;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index bdbd605c4215..69b7a8aeca3b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
#include "power.h"
@@ -189,6 +190,38 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
}
power_attr(mem_sleep);
+
+/*
+ * sync_on_suspend: invoke ksys_sync_helper() before suspend.
+ *
+ * show() returns whether ksys_sync_helper() is invoked before suspend.
+ * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it.
+ */
+bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
+
+static ssize_t sync_on_suspend_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", sync_on_suspend_enabled);
+}
+
+static ssize_t sync_on_suspend_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ sync_on_suspend_enabled = !!val;
+ return n;
+}
+
+power_attr(sync_on_suspend);
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -254,7 +287,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP_DEBUG */
-#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
{
switch (step) {
@@ -275,6 +307,92 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
+#define suspend_attr(_name) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", suspend_stats._name); \
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_attr(success);
+suspend_attr(fail);
+suspend_attr(failed_freeze);
+suspend_attr(failed_prepare);
+suspend_attr(failed_suspend);
+suspend_attr(failed_suspend_late);
+suspend_attr(failed_suspend_noirq);
+suspend_attr(failed_resume);
+suspend_attr(failed_resume_early);
+suspend_attr(failed_resume_noirq);
+
+static ssize_t last_failed_dev_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ char *last_failed_dev = NULL;
+
+ index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_dev = suspend_stats.failed_devs[index];
+
+ return sprintf(buf, "%s\n", last_failed_dev);
+}
+static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev);
+
+static ssize_t last_failed_errno_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ int last_failed_errno;
+
+ index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_errno = suspend_stats.errno[index];
+
+ return sprintf(buf, "%d\n", last_failed_errno);
+}
+static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
+
+static ssize_t last_failed_step_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ enum suspend_stat_step step;
+ char *last_failed_step = NULL;
+
+ index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ step = suspend_stats.failed_steps[index];
+ last_failed_step = suspend_step_name(step);
+
+ return sprintf(buf, "%s\n", last_failed_step);
+}
+static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
+
+static struct attribute *suspend_attrs[] = {
+ &success.attr,
+ &fail.attr,
+ &failed_freeze.attr,
+ &failed_prepare.attr,
+ &failed_suspend.attr,
+ &failed_suspend_late.attr,
+ &failed_suspend_noirq.attr,
+ &failed_resume.attr,
+ &failed_resume_early.attr,
+ &failed_resume_noirq.attr,
+ &last_failed_dev.attr,
+ &last_failed_errno.attr,
+ &last_failed_step.attr,
+ NULL,
+};
+
+static struct attribute_group suspend_attr_group = {
+ .name = "suspend_stats",
+ .attrs = suspend_attrs,
+};
+
+#ifdef CONFIG_DEBUG_FS
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
@@ -495,7 +613,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
len = p ? p - buf : n;
/* Check hibernation first. */
- if (len == 4 && !strncmp(buf, "disk", len))
+ if (len == 4 && str_has_prefix(buf, "disk"))
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
@@ -769,6 +887,7 @@ static struct attribute * g[] = {
&wakeup_count_attr.attr,
#ifdef CONFIG_SUSPEND
&mem_sleep_attr.attr,
+ &sync_on_suspend_attr.attr,
#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
@@ -794,6 +913,14 @@ static const struct attribute_group attr_group = {
.attrs = g,
};
+static const struct attribute_group *attr_groups[] = {
+ &attr_group,
+#ifdef CONFIG_PM_SLEEP
+ &suspend_attr_group,
+#endif
+ NULL,
+};
+
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -815,7 +942,7 @@ static int __init pm_init(void)
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- error = sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_groups(power_kobj, attr_groups);
if (error)
return error;
pm_print_times_init();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 44bee462ff57..7cdc64dc2373 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t);
extern int swsusp_unmark(void);
#endif
-struct timeval;
+struct __kernel_old_timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 33e3febaba53..83edf8698118 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -78,57 +78,9 @@ static struct pm_qos_object cpu_dma_pm_qos = {
.name = "cpu_dma_latency",
};
-static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_constraints network_lat_constraints = {
- .list = PLIST_HEAD_INIT(network_lat_constraints.list),
- .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN,
- .notifiers = &network_lat_notifier,
-};
-static struct pm_qos_object network_lat_pm_qos = {
- .constraints = &network_lat_constraints,
- .name = "network_latency",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_constraints network_tput_constraints = {
- .list = PLIST_HEAD_INIT(network_tput_constraints.list),
- .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .type = PM_QOS_MAX,
- .notifiers = &network_throughput_notifier,
-};
-static struct pm_qos_object network_throughput_pm_qos = {
- .constraints = &network_tput_constraints,
- .name = "network_throughput",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
-static struct pm_qos_constraints memory_bw_constraints = {
- .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
- .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .type = PM_QOS_SUM,
- .notifiers = &memory_bandwidth_notifier,
-};
-static struct pm_qos_object memory_bandwidth_pm_qos = {
- .constraints = &memory_bw_constraints,
- .name = "memory_bandwidth",
-};
-
-
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
- &network_lat_pm_qos,
- &network_throughput_pm_qos,
- &memory_bandwidth_pm_qos,
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -698,3 +650,251 @@ static int __init pm_qos_power_init(void)
}
late_initcall(pm_qos_power_init);
+
+/* Definitions related to the frequency QoS below. */
+
+/**
+ * freq_constraints_init - Initialize frequency QoS constraints.
+ * @qos: Frequency QoS constraints to initialize.
+ */
+void freq_constraints_init(struct freq_constraints *qos)
+{
+ struct pm_qos_constraints *c;
+
+ c = &qos->min_freq;
+ plist_head_init(&c->list);
+ c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->type = PM_QOS_MAX;
+ c->notifiers = &qos->min_freq_notifiers;
+ BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers);
+
+ c = &qos->max_freq;
+ plist_head_init(&c->list);
+ c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->type = PM_QOS_MIN;
+ c->notifiers = &qos->max_freq_notifiers;
+ BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers);
+}
+
+/**
+ * freq_qos_read_value - Get frequency QoS constraint for a given list.
+ * @qos: Constraints to evaluate.
+ * @type: QoS request type.
+ */
+s32 freq_qos_read_value(struct freq_constraints *qos,
+ enum freq_qos_req_type type)
+{
+ s32 ret;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = IS_ERR_OR_NULL(qos) ?
+ FREQ_QOS_MIN_DEFAULT_VALUE :
+ pm_qos_read_value(&qos->min_freq);
+ break;
+ case FREQ_QOS_MAX:
+ ret = IS_ERR_OR_NULL(qos) ?
+ FREQ_QOS_MAX_DEFAULT_VALUE :
+ pm_qos_read_value(&qos->max_freq);
+ break;
+ default:
+ WARN_ON(1);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * freq_qos_apply - Add/modify/remove frequency QoS request.
+ * @req: Constraint request to apply.
+ * @action: Action to perform (add/update/remove).
+ * @value: Value to assign to the QoS request.
+ *
+ * This is only meant to be called from inside pm_qos, not drivers.
+ */
+int freq_qos_apply(struct freq_qos_request *req,
+ enum pm_qos_req_action action, s32 value)
+{
+ int ret;
+
+ switch(req->type) {
+ case FREQ_QOS_MIN:
+ ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode,
+ action, value);
+ break;
+ case FREQ_QOS_MAX:
+ ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode,
+ action, value);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/**
+ * freq_qos_add_request - Insert new frequency QoS request into a given list.
+ * @qos: Constraints to update.
+ * @req: Preallocated request object.
+ * @type: Request type.
+ * @value: Request value.
+ *
+ * Insert a new entry into the @qos list of requests, recompute the effective
+ * QoS constraint value for that list and initialize the @req object. The
+ * caller needs to save that object for later use in updates and removal.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_add_request(struct freq_constraints *qos,
+ struct freq_qos_request *req,
+ enum freq_qos_req_type type, s32 value)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !req)
+ return -EINVAL;
+
+ if (WARN(freq_qos_request_active(req),
+ "%s() called for active request\n", __func__))
+ return -EINVAL;
+
+ req->qos = qos;
+ req->type = type;
+ ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value);
+ if (ret < 0) {
+ req->qos = NULL;
+ req->type = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_add_request);
+
+/**
+ * freq_qos_update_request - Modify existing frequency QoS request.
+ * @req: Request to modify.
+ * @new_value: New request value.
+ *
+ * Update an existing frequency QoS request along with the effective constraint
+ * value for the list of requests it belongs to.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
+{
+ if (!req)
+ return -EINVAL;
+
+ if (WARN(!freq_qos_request_active(req),
+ "%s() called for unknown object\n", __func__))
+ return -EINVAL;
+
+ if (req->pnode.prio == new_value)
+ return 0;
+
+ return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value);
+}
+EXPORT_SYMBOL_GPL(freq_qos_update_request);
+
+/**
+ * freq_qos_remove_request - Remove frequency QoS request from its list.
+ * @req: Request to remove.
+ *
+ * Remove the given frequency QoS request from the list of constraints it
+ * belongs to and recompute the effective constraint value for that list.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_remove_request(struct freq_qos_request *req)
+{
+ int ret;
+
+ if (!req)
+ return -EINVAL;
+
+ if (WARN(!freq_qos_request_active(req),
+ "%s() called for unknown object\n", __func__))
+ return -EINVAL;
+
+ ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
+ req->qos = NULL;
+ req->type = 0;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_remove_request);
+
+/**
+ * freq_qos_add_notifier - Add frequency QoS change notifier.
+ * @qos: List of requests to add the notifier to.
+ * @type: Request type.
+ * @notifier: Notifier block to add.
+ */
+int freq_qos_add_notifier(struct freq_constraints *qos,
+ enum freq_qos_req_type type,
+ struct notifier_block *notifier)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !notifier)
+ return -EINVAL;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = blocking_notifier_chain_register(qos->min_freq.notifiers,
+ notifier);
+ break;
+ case FREQ_QOS_MAX:
+ ret = blocking_notifier_chain_register(qos->max_freq.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_add_notifier);
+
+/**
+ * freq_qos_remove_notifier - Remove frequency QoS change notifier.
+ * @qos: List of requests to remove the notifier from.
+ * @type: Request type.
+ * @notifier: Notifier block to remove.
+ */
+int freq_qos_remove_notifier(struct freq_constraints *qos,
+ enum freq_qos_req_type type,
+ struct notifier_block *notifier)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !notifier)
+ return -EINVAL;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers,
+ notifier);
+ break;
+ case FREQ_QOS_MAX:
+ ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_remove_notifier);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 83105874f255..ddade80ad276 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -8,7 +8,7 @@
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/version.h>
#include <linux/module.h>
@@ -734,8 +734,15 @@ zone_found:
* We have found the zone. Now walk the radix tree to find the leaf node
* for our PFN.
*/
+
+ /*
+ * If the zone we wish to scan is the the current zone and the
+ * pfn falls into the current node then we do not need to walk
+ * the tree.
+ */
node = bm->cur.node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ if (zone == bm->cur.zone &&
+ ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
@@ -1140,24 +1147,24 @@ void free_basic_memory_bitmaps(void)
void clear_free_pages(void)
{
-#ifdef CONFIG_PAGE_POISONING_ZERO
struct memory_bitmap *bm = free_pages_map;
unsigned long pfn;
if (WARN_ON(!(free_pages_map)))
return;
- memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm);
- while (pfn != BM_END_OF_MAP) {
- if (pfn_valid(pfn))
- clear_highpage(pfn_to_page(pfn));
-
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+ memory_bm_position_reset(bm);
pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_highpage(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("free pages cleared after restore\n");
}
- memory_bm_position_reset(bm);
- pr_info("free pages cleared after restore\n");
-#endif /* PAGE_POISONING_ZERO */
}
/**
@@ -1559,9 +1566,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
- x *= multiplier;
- do_div(x, base);
- return (unsigned long)x;
+ return div64_u64(x * multiplier, base);
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
@@ -1698,16 +1703,20 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- pr_info("Preallocating image memory... ");
+ pr_info("Preallocating image memory\n");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate original bitmap\n");
goto err_out;
+ }
error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate copy bitmap\n");
goto err_out;
+ }
alloc_normal = 0;
alloc_highmem = 0;
@@ -1797,8 +1806,11 @@ int hibernate_preallocate_memory(void)
alloc -= pages;
pages += pages_highmem;
pages_highmem = preallocate_image_highmem(alloc);
- if (pages_highmem < alloc)
+ if (pages_highmem < alloc) {
+ pr_err("Image allocation is %lu pages short\n",
+ alloc - pages_highmem);
goto err_out;
+ }
pages += pages_highmem;
/*
* size is the desired number of saveable pages to leave in
@@ -1829,13 +1841,12 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- pr_cont("done (allocated %lu pages)\n", pages);
+ pr_info("Allocated %lu pages for snapshot\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1969,7 +1980,7 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- pr_info("Creating hibernation image:\n");
+ pr_info("Creating image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
@@ -2003,7 +2014,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
+ pr_info("Image created (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c874a7026e24..8b1bb5ee7e5d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -121,43 +121,26 @@ static void s2idle_loop(void)
{
pm_pr_dbg("suspend-to-idle\n");
+ /*
+ * Suspend-to-idle equals:
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after all devices have
+ * been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious, so try
+ * to avoid them upfront.
+ */
for (;;) {
- int error;
-
- dpm_noirq_begin();
-
- /*
- * Suspend-to-idle equals
- * frozen processes + suspended devices + idle processors.
- * Thus s2idle_enter() should be called right after
- * all devices have been suspended.
- *
- * Wakeups during the noirq suspend of devices may be spurious,
- * so prevent them from terminating the loop right away.
- */
- error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
- if (!error)
- s2idle_enter();
- else if (error == -EBUSY && pm_wakeup_pending())
- error = 0;
-
- if (!error && s2idle_ops && s2idle_ops->wake)
- s2idle_ops->wake();
-
- dpm_noirq_resume_devices(PMSG_RESUME);
-
- dpm_noirq_end();
-
- if (error)
- break;
-
- if (s2idle_ops && s2idle_ops->sync)
- s2idle_ops->sync();
-
- if (pm_wakeup_pending())
+ if (s2idle_ops && s2idle_ops->wake) {
+ if (s2idle_ops->wake())
+ break;
+ } else if (pm_wakeup_pending()) {
break;
+ }
pm_wakeup_clear(false);
+
+ s2idle_enter();
}
pm_pr_dbg("resume from suspend-to-idle\n");
@@ -271,14 +254,21 @@ static int platform_suspend_prepare_late(suspend_state_t state)
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
- suspend_ops->prepare_late() : 0;
+ if (state == PM_SUSPEND_TO_IDLE)
+ return s2idle_ops && s2idle_ops->prepare_late ?
+ s2idle_ops->prepare_late() : 0;
+
+ return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
+ if (state == PM_SUSPEND_TO_IDLE) {
+ if (s2idle_ops && s2idle_ops->restore_early)
+ s2idle_ops->restore_early();
+ } else if (suspend_ops->wake) {
suspend_ops->wake();
+ }
}
static void platform_resume_early(suspend_state_t state)
@@ -415,11 +405,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (error)
goto Devices_early_resume;
- if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
pr_err("noirq suspend of devices failed\n");
@@ -432,6 +417,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
+ if (state == PM_SUSPEND_TO_IDLE) {
+ s2idle_loop();
+ goto Platform_wake;
+ }
+
error = suspend_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -575,7 +565,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_TO_IDLE)
s2idle_begin();
- if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
+ if (sync_on_suspend_enabled) {
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 60564b58de07..e1ed58adb69e 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -70,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
- unsigned long now;
+ time64_t now;
struct rtc_wkalrm alm;
int status;
@@ -81,10 +81,10 @@ repeat:
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
- rtc_tm_to_time(&alm.time, &now);
+ now = rtc_tm_to_time64(&alm.time);
memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 4210152e56f0..105df4dfc783 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(wakelocks_lock);
struct wakelock {
char *name;
struct rb_node node;
- struct wakeup_source ws;
+ struct wakeup_source *ws;
#ifdef CONFIG_PM_WAKELOCKS_GC
struct list_head lru;
#endif
@@ -46,7 +46,7 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
- if (wl->ws.active == show_active)
+ if (wl->ws->active == show_active)
str += scnprintf(str, end - str, "%s ", wl->name);
}
if (str > buf)
@@ -112,16 +112,16 @@ static void __wakelocks_gc(struct work_struct *work)
u64 idle_time_ns;
bool active;
- spin_lock_irq(&wl->ws.lock);
- idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
- active = wl->ws.active;
- spin_unlock_irq(&wl->ws.lock);
+ spin_lock_irq(&wl->ws->lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time));
+ active = wl->ws->active;
+ spin_unlock_irq(&wl->ws->lock);
if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
break;
if (!active) {
- wakeup_source_remove(&wl->ws);
+ wakeup_source_unregister(wl->ws);
rb_erase(&wl->node, &wakelocks_tree);
list_del(&wl->lru);
kfree(wl->name);
@@ -187,9 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
kfree(wl);
return ERR_PTR(-ENOMEM);
}
- wl->ws.name = wl->name;
- wl->ws.last_time = ktime_get();
- wakeup_source_add(&wl->ws);
+
+ wl->ws = wakeup_source_register(NULL, wl->name);
+ if (!wl->ws) {
+ kfree(wl->name);
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws->last_time = ktime_get();
+
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
wakelocks_lru_add(wl);
@@ -233,9 +239,9 @@ int pm_wake_lock(const char *buf)
u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
do_div(timeout_ms, NSEC_PER_MSEC);
- __pm_wakeup_event(&wl->ws, timeout_ms);
+ __pm_wakeup_event(wl->ws, timeout_ms);
} else {
- __pm_stay_awake(&wl->ws);
+ __pm_stay_awake(wl->ws);
}
wakelocks_lru_most_recent(wl);
@@ -271,7 +277,7 @@ int pm_wake_unlock(const char *buf)
ret = PTR_ERR(wl);
goto out;
}
- __pm_relax(&wl->ws);
+ __pm_relax(wl->ws);
wakelocks_lru_most_recent(wl);
wakelocks_gc();
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 1d21ebacfdb8..17a9591e54ff 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -11,11 +11,18 @@
int _braille_console_setup(char **str, char **brl_options)
{
- if (!strncmp(*str, "brl,", 4)) {
+ size_t len;
+
+ len = str_has_prefix(*str, "brl,");
+ if (len) {
*brl_options = "";
- *str += 4;
- } else if (!strncmp(*str, "brl=", 4)) {
- *brl_options = *str + 4;
+ *str += len;
+ return 0;
+ }
+
+ len = str_has_prefix(*str, "brl=");
+ if (len) {
+ *brl_options = *str + len;
*str = strchr(*brl_options, ',');
if (!*str) {
pr_err("need port name after brl=\n");
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1888f6a3b694..fada22dc4ab6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
static int __control_devkmsg(char *str)
{
+ size_t len;
+
if (!str)
return -EINVAL;
- if (!strncmp(str, "on", 2)) {
+ len = str_has_prefix(str, "on");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_ON;
- return 2;
- } else if (!strncmp(str, "off", 3)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "off");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_OFF;
- return 3;
- } else if (!strncmp(str, "ratelimit", 9)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "ratelimit");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
- return 9;
+ return len;
}
+
return -EINVAL;
}
@@ -238,7 +248,7 @@ static void __up_console_sem(unsigned long ip)
{
unsigned long flags;
- mutex_release(&console_lock_dep_map, 1, ip);
+ mutex_release(&console_lock_dep_map, ip);
printk_safe_enter_irqsave(flags);
up(&console_sem);
@@ -1669,20 +1679,20 @@ static int console_lock_spinning_disable_and_check(void)
raw_spin_unlock(&console_owner_lock);
if (!waiter) {
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
return 0;
}
/* The waiter is now free to continue */
WRITE_ONCE(console_waiter, false);
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
/*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
- mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
+ mutex_release(&console_lock_dep_map, _THIS_IP_);
return 1;
}
@@ -1736,7 +1746,7 @@ static int console_trylock_spinning(void)
/* Owner will clear console_waiter on hand off */
while (READ_ONCE(console_waiter))
cpu_relax();
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
printk_safe_exit_irqrestore(flags);
/*
@@ -2760,8 +2770,6 @@ void register_console(struct console *newcon)
* for us.
*/
logbuf_lock_irqsave(flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2773,6 +2781,8 @@ void register_console(struct console *newcon)
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -2951,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
+ .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
};
void wake_up_klogd(void)
@@ -3274,7 +3284,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- while (l > size && seq < dumper->next_seq) {
+ while (l >= size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
l -= msg_print_text(msg, true, time, NULL, 0);
diff --git a/kernel/profile.c b/kernel/profile.c
index af7c94bf5fa1..6f69a4195d56 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -336,7 +336,7 @@ static int profile_dead_cpu(unsigned int cpu)
struct page *page;
int i;
- if (prof_cpu_mask != NULL)
+ if (cpumask_available(prof_cpu_mask))
cpumask_clear_cpu(cpu, prof_cpu_mask);
for (i = 0; i < 2; i++) {
@@ -373,7 +373,7 @@ static int profile_prepare_cpu(unsigned int cpu)
static int profile_online_cpu(unsigned int cpu)
{
- if (prof_cpu_mask != NULL)
+ if (cpumask_available(prof_cpu_mask))
cpumask_set_cpu(cpu, prof_cpu_mask);
return 0;
@@ -403,7 +403,7 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (!user_mode(regs) && prof_cpu_mask != NULL &&
+ if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
}
@@ -442,18 +442,18 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
return err;
}
-static const struct file_operations prof_cpu_mask_proc_fops = {
- .open = prof_cpu_mask_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = prof_cpu_mask_proc_write,
+static const struct proc_ops prof_cpu_mask_proc_ops = {
+ .proc_open = prof_cpu_mask_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = prof_cpu_mask_proc_write,
};
void create_prof_cpu_mask(void)
{
/* create /proc/irq/prof_cpu_mask */
- proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
}
/*
@@ -517,10 +517,10 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return count;
}
-static const struct file_operations proc_profile_operations = {
- .read = read_profile,
- .write = write_profile,
- .llseek = default_llseek,
+static const struct proc_ops profile_proc_ops = {
+ .proc_read = read_profile,
+ .proc_write = write_profile,
+ .proc_lseek = default_llseek,
};
int __ref create_proc_profile(void)
@@ -548,7 +548,7 @@ int __ref create_proc_profile(void)
err = 0;
#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
- NULL, &proc_profile_operations);
+ NULL, &profile_proc_ops);
if (!entry)
goto err_state_onl;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cb9ddcc08119..43d6179508d6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
+ unsigned int mode)
{
+ int ret;
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
+
+ return ret == 0;
}
/* Returns 0 on success, -errno on denial. */
@@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(cred, tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -340,7 +345,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
+ !ptrace_has_cap(cred, mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 480edf328b51..1cc940fef17c 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
config TREE_RCU
bool
- default y if !PREEMPT && SMP
+ default y if SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -16,7 +16,8 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPT
+ default y if PREEMPTION
+ select TREE_RCU
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -28,7 +29,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPT && !SMP
+ default y if !PREEMPTION && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -70,7 +71,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- def_bool PREEMPT
+ def_bool PREEMPTION
select SRCU
help
This option enables a task-based RCU implementation that uses
@@ -78,7 +79,7 @@ config TASKS_RCU
user-mode execution as quiescent states.
config RCU_STALL_COMMON
- def_bool ( TREE_RCU || PREEMPT_RCU )
+ def_bool TREE_RCU
help
This option enables RCU CPU stall code that is common between
the TINY and TREE variants of RCU. The purpose is to allow
@@ -86,13 +87,13 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ depends on TREE_RCU && RCU_EXPERT
default 64 if 64BIT
default 32 if !64BIT
help
@@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF
int "Tree-based hierarchical RCU leaf-level fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ depends on TREE_RCU && RCU_EXPERT
default 16
help
This option controls the leaf-level fanout of hierarchical
@@ -187,7 +188,7 @@ config RCU_BOOST_DELAY
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
- depends on TREE_RCU || PREEMPT_RCU
+ depends on TREE_RCU
depends on RCU_EXPERT || NO_HZ_FULL
default n
help
@@ -200,8 +201,8 @@ config RCU_NOCB_CPU
specified at boot time by the rcu_nocbs parameter. For each
such CPU, a kthread ("rcuox/N") will be created to invoke
callbacks, where the "N" is the CPU being offloaded, and where
- the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched
- (!PREEMPT kernels). Nothing prevents this kthread from running
+ the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
+ (!PREEMPTION kernels). Nothing prevents this kthread from running
on the specified CPUs, but (1) the kthreads may be preempted
between each callback, and (2) affinity or cgroups can be used
to force the kthreads to run on whatever set of CPUs is desired.
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
config PROVE_RCU
def_bool PROVE_LOCKING
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 020e8b6a644b..82d5fba48b2f 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5290b01de534..05f936ed167a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -198,35 +198,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-void kfree(const void *);
-
-/*
- * Reclaim the specified callback, either by invoking it (non-lazy case)
- * or freeing it directly (lazy case). Return true if lazy, false otherwise.
- */
-static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
-{
- rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
-
- rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- trace_rcu_invoke_kfree_callback(rn, head, offset);
- kfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- } else {
- trace_rcu_invoke_callback(rn, head);
- f = head->func;
- WRITE_ONCE(head->func, (rcu_callback_t)0L);
- f(head);
- rcu_lock_release(&rcu_callback_map);
- return false;
- }
-}
-
#ifdef CONFIG_RCU_STALL_COMMON
+extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
@@ -280,7 +254,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(SRCU) || !defined(TINY_RCU)
+#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -298,6 +272,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
+ for (i = 0; i < RCU_NUM_LVLS; i++)
+ levelspread[i] = INT_MIN;
if (rcu_fanout_exact) {
levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
@@ -415,7 +391,7 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_SRCU
void srcu_init(void);
@@ -451,10 +427,9 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
-void rcutorture_record_progress(unsigned long vernum);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
@@ -467,7 +442,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*flags = 0;
*gp_seq = 0;
}
-static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9bd5f6023c21..5f4fd3b8777c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -20,15 +20,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
rclp->head = NULL;
rclp->tail = &rclp->head;
rclp->len = 0;
- rclp->len_lazy = 0;
+}
+
+/*
+ * Enqueue an rcu_head structure onto the specified callback list.
+ */
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
+{
+ *rclp->tail = rhp;
+ rclp->tail = &rhp->next;
+ WRITE_ONCE(rclp->len, rclp->len + 1);
+}
+
+/*
+ * Flush the second rcu_cblist structure onto the first one, obliterating
+ * any contents of the first. If rhp is non-NULL, enqueue it as the sole
+ * element of the second rcu_cblist structure, but ensuring that the second
+ * rcu_cblist structure, if initially non-empty, always appears non-empty
+ * throughout the process. If rdp is NULL, the second rcu_cblist structure
+ * is instead initialized to empty.
+ */
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp)
+{
+ drclp->head = srclp->head;
+ if (drclp->head)
+ drclp->tail = srclp->tail;
+ else
+ drclp->tail = &drclp->head;
+ drclp->len = srclp->len;
+ if (!rhp) {
+ rcu_cblist_init(srclp);
+ } else {
+ rhp->next = NULL;
+ srclp->head = rhp;
+ srclp->tail = &rhp->next;
+ WRITE_ONCE(srclp->len, 1);
+ }
}
/*
* Dequeue the oldest rcu_head structure from the specified callback
- * list. This function assumes that the callback is non-lazy, but
- * the caller can later invoke rcu_cblist_dequeued_lazy() if it
- * finds otherwise (and if it cares about laziness). This allows
- * different users to have different ways of determining laziness.
+ * list.
*/
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
{
@@ -44,6 +78,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
return rhp;
}
+/* Set the length of an rcu_segcblist structure. */
+static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ atomic_long_set(&rsclp->len, v);
+#else
+ WRITE_ONCE(rsclp->len, v);
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by the
+ * specified amount, which can be negative. This can cause the ->len
+ * field to disagree with the actual number of callbacks on the structure.
+ * This increase is fully ordered with respect to the callers accesses
+ * both before and after.
+ */
+static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ smp_mb__before_atomic(); /* Up to the caller! */
+ atomic_long_add(v, &rsclp->len);
+ smp_mb__after_atomic(); /* Up to the caller! */
+#else
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, rsclp->len + v);
+ smp_mb(); /* Up to the caller! */
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by one.
+ * This can cause the ->len field to disagree with the actual number of
+ * callbacks on the structure. This increase is fully ordered with respect
+ * to the callers accesses both before and after.
+ */
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
+{
+ rcu_segcblist_add_len(rsclp, 1);
+}
+
+/*
+ * Exchange the numeric length of the specified rcu_segcblist structure
+ * with the specified value. This can cause the ->len field to disagree
+ * with the actual number of callbacks on the structure. This exchange is
+ * fully ordered with respect to the callers accesses both before and after.
+ */
+static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_xchg(&rsclp->len, v);
+#else
+ long ret = rsclp->len;
+
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, v);
+ smp_mb(); /* Up to the caller! */
+ return ret;
+#endif
+}
+
/*
* Initialize an rcu_segcblist structure.
*/
@@ -56,8 +151,8 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
rsclp->head = NULL;
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head;
- rsclp->len = 0;
- rsclp->len_lazy = 0;
+ rcu_segcblist_set_len(rsclp, 0);
+ rsclp->enabled = 1;
}
/*
@@ -68,8 +163,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
- rsclp->tails[RCU_NEXT_TAIL] = NULL;
+ rsclp->enabled = 0;
+}
+
+/*
+ * Mark the specified rcu_segcblist structure as offloaded. This
+ * structure must be empty.
+ */
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+{
+ rsclp->offloaded = 1;
}
/*
@@ -118,6 +221,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
+ * Return false if there are no CBs awaiting grace periods, otherwise,
+ * return true and store the nearest waited-upon grace period into *lp.
+ */
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+{
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return false;
+ *lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+ return true;
+}
+
+/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -127,15 +242,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
* absolutely not OK for it to ever miss posting a callback.
*/
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy)
+ struct rcu_head *rhp)
{
- WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
- if (lazy)
- rsclp->len_lazy++;
+ rcu_segcblist_inc_len(rsclp);
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
- *rsclp->tails[RCU_NEXT_TAIL] = rhp;
- rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
}
/*
@@ -149,23 +262,21 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
* period. You have been warned.
*/
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy)
+ struct rcu_head *rhp)
{
int i;
if (rcu_segcblist_n_cbs(rsclp) == 0)
return false;
- WRITE_ONCE(rsclp->len, rsclp->len + 1);
- if (lazy)
- rsclp->len_lazy++;
+ rcu_segcblist_inc_len(rsclp);
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1])
break;
- *rsclp->tails[i] = rhp;
+ WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
- rsclp->tails[i] = &rhp->next;
+ WRITE_ONCE(rsclp->tails[i], &rhp->next);
return true;
}
@@ -181,10 +292,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
- rclp->len_lazy += rsclp->len_lazy;
- rclp->len += rsclp->len;
- rsclp->len_lazy = 0;
- WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
+ rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
}
/*
@@ -200,12 +308,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
*rclp->tail = rsclp->head;
- rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
rclp->tail = rsclp->tails[RCU_DONE_TAIL];
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
- rsclp->tails[i] = &rsclp->head;
+ WRITE_ONCE(rsclp->tails[i], &rsclp->head);
}
/*
@@ -224,9 +332,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
return; /* Nothing to do. */
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
- rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
}
/*
@@ -236,10 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
- rsclp->len_lazy += rclp->len_lazy;
- /* ->len sampled locklessly. */
- WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
- rclp->len_lazy = 0;
+ rcu_segcblist_add_len(rsclp, rclp->len);
rclp->len = 0;
}
@@ -255,10 +360,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
*rclp->tail = rsclp->head;
- rsclp->head = rclp->head;
+ WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
if (&rsclp->head == rsclp->tails[i])
- rsclp->tails[i] = rclp->tail;
+ WRITE_ONCE(rsclp->tails[i], rclp->tail);
else
break;
rclp->head = NULL;
@@ -274,8 +379,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
- *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
- rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
rclp->head = NULL;
rclp->tail = &rclp->head;
}
@@ -299,7 +404,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
- rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
}
/* If no callbacks moved, nothing more need be done. */
@@ -308,7 +413,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
/* Clean up tail pointers that might have been misordered above. */
for (j = RCU_WAIT_TAIL; j < i; j++)
- rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
* Callbacks moved, so clean up the misordered ->tails[] pointers
@@ -319,7 +424,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
- rsclp->tails[j] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -384,7 +489,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* structure other than in the RCU_NEXT_TAIL segment.
*/
for (; i < RCU_NEXT_TAIL; i++) {
- rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
rsclp->gp_seq[i] = seq;
}
return true;
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 71b64648464e..5c293afc07b8 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -9,16 +9,17 @@
#include <linux/rcu_segcblist.h>
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
+/* Return number of callbacks in the specified callback list. */
+static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
{
- rclp->len_lazy--;
+ return READ_ONCE(rclp->len);
}
void rcu_cblist_init(struct rcu_cblist *rclp);
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -36,34 +37,32 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
*/
static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
{
- return !rsclp->head;
+ return !READ_ONCE(rsclp->head);
}
/* Return number of callbacks in segmented callback list. */
static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_read(&rsclp->len);
+#else
return READ_ONCE(rsclp->len);
-}
-
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
-{
- return rsclp->len_lazy;
-}
-
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
-{
- return rsclp->len - rsclp->len_lazy;
+#endif
}
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
- * to an offline or callback-offloaded CPU?
+ * to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return !!rsclp->tails[RCU_NEXT_TAIL];
+ return rsclp->enabled;
+}
+
+/* Is the specified rcu_segcblist offloaded? */
+static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
+{
+ return rsclp->offloaded;
}
/*
@@ -73,40 +72,22 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
*/
static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
{
- return !*rsclp->tails[seg];
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
-{
- return rsclp->head;
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
-{
- WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
- return rsclp->tails[RCU_NEXT_TAIL];
+ return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy);
+ struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
- struct rcu_head *rhp, bool lazy);
+ struct rcu_head *rhp);
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..da94b89cd531 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -86,10 +86,11 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");
static int nrealreaders;
static int nrealwriters;
@@ -105,19 +106,10 @@ static atomic_t n_rcu_perf_writer_finished;
static wait_queue_head_t shutdown_wq;
static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
static DEFINE_PER_CPU(atomic_t, n_async_inflight);
-static int rcu_perf_writer_state;
-#define RTWS_INIT 0
-#define RTWS_ASYNC 1
-#define RTWS_BARRIER 2
-#define RTWS_EXP_SYNC 3
-#define RTWS_SYNC 4
-#define RTWS_IDLE 5
-#define RTWS_STOPPING 6
-
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -375,14 +367,22 @@ rcu_perf_writer(void *arg)
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
+ /*
+ * Wait until rcu_end_inkernel_boot() is called for normal GP tests
+ * so that RCU is not always expedited for normal GP tests.
+ * The system_state test is approximate, but works well in practice.
+ */
+ while (!gp_exp && system_state != SYSTEM_RUNNING)
+ schedule_timeout_uninterruptible(1);
+
t = ktime_get_mono_fast_ns();
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
if (gp_exp) {
- b_rcu_perf_writer_started =
+ b_rcu_gp_test_started =
cur_ops->exp_completed() / 2;
} else {
- b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
}
}
@@ -396,25 +396,20 @@ retry:
if (!rhp)
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
- rcu_perf_writer_state = RTWS_ASYNC;
atomic_inc(this_cpu_ptr(&n_async_inflight));
cur_ops->async(rhp, rcu_perf_async_cb);
rhp = NULL;
} else if (!kthread_should_stop()) {
- rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
goto retry;
} else {
kfree(rhp); /* Because we are stopping. */
}
} else if (gp_exp) {
- rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
- rcu_perf_writer_state = RTWS_SYNC;
cur_ops->sync();
}
- rcu_perf_writer_state = RTWS_IDLE;
t = ktime_get_mono_fast_ns();
*wdp = t - *wdp;
i_max = i;
@@ -435,10 +430,10 @@ retry:
PERFOUT_STRING("Test complete");
t_rcu_perf_writer_finished = t;
if (gp_exp) {
- b_rcu_perf_writer_finished =
+ b_rcu_gp_test_finished =
cur_ops->exp_completed() / 2;
} else {
- b_rcu_perf_writer_finished =
+ b_rcu_gp_test_finished =
cur_ops->get_gp_seq();
}
if (shutdown) {
@@ -455,10 +450,8 @@ retry:
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
if (gp_async) {
- rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
}
- rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
return 0;
@@ -523,8 +516,8 @@ rcu_perf_cleanup(void)
t_rcu_perf_writer_finished -
t_rcu_perf_writer_started,
ngps,
- rcuperf_seq_diff(b_rcu_perf_writer_finished,
- b_rcu_perf_writer_started));
+ rcuperf_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
for (i = 0; i < nrealwriters; i++) {
if (!writer_durations)
break;
@@ -592,6 +585,159 @@ rcu_perf_shutdown(void *arg)
return -EINVAL;
}
+/*
+ * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+struct kfree_obj {
+ char kfree_obj[8];
+ struct rcu_head rh;
+};
+
+static int
+kfree_perf_thread(void *arg)
+{
+ int i, loop = 0;
+ long me = (long)arg;
+ struct kfree_obj *alloc_ptr;
+ u64 start_time, end_time;
+
+ VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ start_time = ktime_get_mono_fast_ns();
+
+ if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
+ if (gp_exp)
+ b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+
+ do {
+ for (i = 0; i < kfree_alloc_num; i++) {
+ alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+ if (!alloc_ptr)
+ return -ENOMEM;
+
+ kfree_rcu(alloc_ptr, rh);
+ }
+
+ cond_resched();
+ } while (!torture_must_stop() && ++loop < kfree_loops);
+
+ if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+ end_time = ktime_get_mono_fast_ns();
+
+ if (gp_exp)
+ b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+ (unsigned long long)(end_time - start_time), kfree_loops,
+ rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+
+ torture_kthread_stopping("kfree_perf_thread");
+ return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (kfree_reader_tasks) {
+ for (i = 0; i < kfree_nrealthreads; i++)
+ torture_stop_kthread(kfree_perf_thread,
+ kfree_reader_tasks[i]);
+ kfree(kfree_reader_tasks);
+ }
+
+ torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread. Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_perf_shutdown(void *arg)
+{
+ do {
+ wait_event(shutdown_wq,
+ atomic_read(&n_kfree_perf_thread_ended) >=
+ kfree_nrealthreads);
+ } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+
+ smp_mb(); /* Wake before output. */
+
+ kfree_perf_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+static int __init
+kfree_perf_init(void)
+{
+ long i;
+ int firsterr = 0;
+
+ kfree_nrealthreads = compute_real(kfree_nthreads);
+ /* Start up the kthreads. */
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
+ shutdown_task);
+ if (firsterr)
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+ GFP_KERNEL);
+ if (kfree_reader_tasks == NULL) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < kfree_nrealthreads; i++) {
+ firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
+ kfree_reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+
+ while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
+ schedule_timeout_uninterruptible(1);
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ kfree_perf_cleanup();
+ return firsterr;
+}
+
static int __init
rcu_perf_init(void)
{
@@ -624,6 +770,9 @@ rcu_perf_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (kfree_rcu_test)
+ return kfree_perf_init();
+
nrealwriters = compute_real(nwriters);
nrealreaders = compute_real(nreaders);
atomic_set(&n_rcu_perf_reader_started, 0);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fce4e7e6f502..1aeecc165b21 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -44,6 +44,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/oom.h>
+#include <linux/tick.h>
#include "rcu.h"
@@ -161,6 +162,7 @@ static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
static struct list_head rcu_torture_removed;
+static unsigned long shutdown_jiffies;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -228,6 +230,15 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+/*
+ * Stop aggressive CPU-hog tests a bit before the end of the test in order
+ * to avoid interfering with test shutdown.
+ */
+static bool shutdown_time_arrived(void)
+{
+ return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
+}
+
static unsigned long boost_starttime; /* jiffies of next boost test start. */
static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -1353,15 +1364,15 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
-
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand))
+ if (!rcu_torture_one_read(&rand) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
- if (time_after(jiffies, lastsleep)) {
+ if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
schedule_timeout_interruptible(1);
lastsleep = jiffies + 10;
}
@@ -1373,6 +1384,7 @@ rcu_torture_reader(void *arg)
del_timer_sync(&t);
destroy_timer_on_stack(&t);
}
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@@ -1432,15 +1444,18 @@ rcu_torture_stats_print(void)
n_rcu_torture_barrier_error);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- if (atomic_read(&n_rcu_torture_mberror) != 0 ||
- n_rcu_torture_barrier_error != 0 ||
- n_rcu_torture_boost_ktrerror != 0 ||
- n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0 ||
+ if (atomic_read(&n_rcu_torture_mberror) ||
+ n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
+ n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
+ WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
+ WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
+ WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed
+ WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1646,43 +1661,52 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
struct rcu_fwd_cb {
struct rcu_head rh;
struct rcu_fwd_cb *rfc_next;
+ struct rcu_fwd *rfc_rfp;
int rfc_gps;
};
-static DEFINE_SPINLOCK(rcu_fwd_lock);
-static struct rcu_fwd_cb *rcu_fwd_cb_head;
-static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
-static long n_launders_cb;
-static unsigned long rcu_fwd_startat;
-static bool rcu_fwd_emergency_stop;
+
#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */
#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+
struct rcu_launder_hist {
long n_launders;
unsigned long launder_gp_seq;
};
-#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
-static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
-static unsigned long rcu_launder_gp_seq_start;
-static void rcu_torture_fwd_cb_hist(void)
+struct rcu_fwd {
+ spinlock_t rcu_fwd_lock;
+ struct rcu_fwd_cb *rcu_fwd_cb_head;
+ struct rcu_fwd_cb **rcu_fwd_cb_tail;
+ long n_launders_cb;
+ unsigned long rcu_fwd_startat;
+ struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+ unsigned long rcu_launder_gp_seq_start;
+};
+
+struct rcu_fwd *rcu_fwds;
+bool rcu_fwd_emergency_stop;
+
+static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
{
unsigned long gps;
unsigned long gps_old;
int i;
int j;
- for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
- if (n_launders_hist[i].n_launders > 0)
+ for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
+ if (rfp->n_launders_hist[i].n_launders > 0)
break;
pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rcu_fwd_startat);
- gps_old = rcu_launder_gp_seq_start;
+ __func__, jiffies - rfp->rcu_fwd_startat);
+ gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
- gps = n_launders_hist[j].launder_gp_seq;
+ gps = rfp->n_launders_hist[j].launder_gp_seq;
pr_cont(" %ds/%d: %ld:%ld",
- j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+ j + 1, FWD_CBS_HIST_DIV,
+ rfp->n_launders_hist[j].n_launders,
rcutorture_seq_diff(gps, gps_old));
gps_old = gps;
}
@@ -1696,63 +1720,72 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
int i;
struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
struct rcu_fwd_cb **rfcpp;
+ struct rcu_fwd *rfp = rfcp->rfc_rfp;
rfcp->rfc_next = NULL;
rfcp->rfc_gps++;
- spin_lock_irqsave(&rcu_fwd_lock, flags);
- rfcpp = rcu_fwd_cb_tail;
- rcu_fwd_cb_tail = &rfcp->rfc_next;
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcpp = rfp->rcu_fwd_cb_tail;
+ rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
WRITE_ONCE(*rfcpp, rfcp);
- WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
- i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
- if (i >= ARRAY_SIZE(n_launders_hist))
- i = ARRAY_SIZE(n_launders_hist) - 1;
- n_launders_hist[i].n_launders++;
- n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
+ i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+ if (i >= ARRAY_SIZE(rfp->n_launders_hist))
+ i = ARRAY_SIZE(rfp->n_launders_hist) - 1;
+ rfp->n_launders_hist[i].n_launders++;
+ rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
}
// Give the scheduler a chance, even on nohz_full CPUs.
-static void rcu_torture_fwd_prog_cond_resched(void)
+static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
{
- if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- if (need_resched())
+ if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ // Real call_rcu() floods hit userspace, so emulate that.
+ if (need_resched() || (iter & 0xfff))
schedule();
- } else {
- cond_resched();
+ return;
}
+ // No userspace emulation: CB invocation throttles call_rcu()
+ cond_resched();
}
/*
* Free all callbacks on the rcu_fwd_cb_head list, either because the
* test is over or because we hit an OOM event.
*/
-static unsigned long rcu_torture_fwd_prog_cbfree(void)
+static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
{
unsigned long flags;
unsigned long freed = 0;
struct rcu_fwd_cb *rfcp;
for (;;) {
- spin_lock_irqsave(&rcu_fwd_lock, flags);
- rfcp = rcu_fwd_cb_head;
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcp = rfp->rcu_fwd_cb_head;
if (!rfcp) {
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
break;
}
- rcu_fwd_cb_head = rfcp->rfc_next;
- if (!rcu_fwd_cb_head)
- rcu_fwd_cb_tail = &rcu_fwd_cb_head;
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ rfp->rcu_fwd_cb_head = rfcp->rfc_next;
+ if (!rfp->rcu_fwd_cb_head)
+ rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(freed);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
}
return freed;
}
/* Carry out need_resched()/cond_resched() forward-progress testing. */
-static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
+static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
+ int *tested, int *tested_tries)
{
unsigned long cver;
unsigned long dur;
@@ -1782,18 +1815,20 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
sd = cur_ops->stall_dur() + 1;
sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
dur = sd4 + torture_random(&trs) % (sd - sd4);
- WRITE_ONCE(rcu_fwd_startat, jiffies);
- stopat = rcu_fwd_startat + dur;
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + dur;
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
idx = cur_ops->readlock();
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- rcu_torture_fwd_prog_cond_resched();
+ cond_resched();
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
(*tested)++;
cver = READ_ONCE(rcu_torture_current_version) - cver;
@@ -1816,9 +1851,10 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
}
/* Carry out call_rcu() forward-progress testing. */
-static void rcu_torture_fwd_prog_cr(void)
+static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
{
unsigned long cver;
+ unsigned long flags;
unsigned long gps;
int i;
long n_launders;
@@ -1839,21 +1875,23 @@ static void rcu_torture_fwd_prog_cr(void)
/* Loop continuously posting RCU callbacks. */
WRITE_ONCE(rcu_fwd_cb_nodelay, true);
cur_ops->sync(); /* Later readers see above write. */
- WRITE_ONCE(rcu_fwd_startat, jiffies);
- stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
n_launders = 0;
- n_launders_cb = 0;
+ rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread
n_launders_sa = 0;
n_max_cbs = 0;
n_max_gps = 0;
- for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
- n_launders_hist[i].n_launders = 0;
+ for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++)
+ rfp->n_launders_hist[i].n_launders = 0;
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
- rcu_launder_gp_seq_start = gps;
+ rfp->rcu_launder_gp_seq_start = gps;
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
- rfcp = READ_ONCE(rcu_fwd_cb_head);
+ rfcp = READ_ONCE(rfp->rcu_fwd_cb_head);
rfcpn = NULL;
if (rfcp)
rfcpn = READ_ONCE(rfcp->rfc_next);
@@ -1861,7 +1899,7 @@ static void rcu_torture_fwd_prog_cr(void)
if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
break;
- rcu_fwd_cb_head = rfcpn;
+ rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
} else {
@@ -1873,28 +1911,36 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_cbs++;
n_launders_sa = 0;
rfcp->rfc_gps = 0;
+ rfcp->rfc_rfp = rfp;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
}
stoppedat = jiffies;
- n_launders_cb_snap = READ_ONCE(n_launders_cb);
+ n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
- (void)rcu_torture_fwd_prog_cbfree();
+ (void)rcu_torture_fwd_prog_cbfree(rfp);
- if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
+ if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
+ !shutdown_time_arrived()) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
__func__,
- stoppedat - rcu_fwd_startat, jiffies - stoppedat,
+ stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
- rcu_torture_fwd_cb_hist();
+ rcu_torture_fwd_cb_hist(rfp);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
@@ -1906,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(void)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ struct rcu_fwd *rfp = rcu_fwds;
+
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist();
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
+ rcu_torture_fwd_cb_hist(rfp);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
rcu_barrier();
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
rcu_barrier();
pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree());
+ __func__, rcu_torture_fwd_prog_cbfree(rfp));
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -1933,6 +1981,7 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -1944,8 +1993,8 @@ static int rcu_torture_fwd_prog(void *args)
schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
WRITE_ONCE(rcu_fwd_emergency_stop, false);
register_oom_notifier(&rcutorture_oom_nb);
- rcu_torture_fwd_prog_nr(&tested, &tested_tries);
- rcu_torture_fwd_prog_cr();
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+ rcu_torture_fwd_prog_cr(rfp);
unregister_oom_notifier(&rcutorture_oom_nb);
/* Avoid slow periods, better to test when busy. */
@@ -1961,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ struct rcu_fwd *rfp;
+
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
@@ -1979,8 +2030,12 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- return torture_create_kthread(rcu_torture_fwd_prog,
- NULL, fwd_prog_task);
+ rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
+ if (!rfp)
+ return -ENOMEM;
+ spin_lock_init(&rfp->rcu_fwd_lock);
+ rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
}
/* Callback function for RCU barrier testing. */
@@ -2160,6 +2215,7 @@ rcu_torture_cleanup(void)
return;
}
+ show_rcu_gp_kthreads();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2465,6 +2521,7 @@ rcu_torture_init(void)
goto unwind;
rcutor_hp = firsterr;
}
+ shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 44d6606b8325..6208c1dae5c9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* Workqueue handler to drive one grace period and invoke any callbacks
- * that become ready as a result. Single-CPU and !PREEMPT operation
+ * that become ready as a result. Single-CPU and !PREEMPTION operation
* means that we get away with murder on synchronization. ;-)
*/
void srcu_drive_gp(struct work_struct *wp)
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index cf0e886314f2..657e6a7d1c03 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(ssp);
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long flags;
struct srcu_data *sdp;
unsigned long t;
+ unsigned long tlast;
/* If the local srcu_data structure has callbacks, not idle. */
local_irq_save(flags);
@@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
+ tlast = READ_ONCE(ssp->srcu_last_gp_end);
if (exp_holdoff == 0 ||
- time_in_range_open(t, ssp->srcu_last_gp_end,
- ssp->srcu_last_gp_end + exp_holdoff))
+ time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
@@ -853,7 +854,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
local_irq_save(flags);
sdp = this_cpu_ptr(ssp->sda);
spin_lock_rcu_node(sdp);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_gp_seq));
s = rcu_seq_snap(&ssp->srcu_gp_seq);
@@ -1052,7 +1053,7 @@ void srcu_barrier(struct srcu_struct *ssp)
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head, 0)) {
+ &sdp->srcu_barrier_head)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&ssp->srcu_barrier_cpu_cnt);
}
@@ -1279,8 +1280,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
c0 = l0 - u0;
c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %1p)",
- cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist));
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
s0 += c0;
s1 += c1;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 477b4eb44af5..dd572ce7c747 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
+#include <linux/slab.h>
#include "rcu.h"
@@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user)
}
}
+/*
+ * Reclaim the specified callback, either by invoking it for non-kfree cases or
+ * freeing it directly (for kfree). Return true if kfreeing, false otherwise.
+ */
+static inline bool rcu_reclaim_tiny(struct rcu_head *head)
+{
+ rcu_callback_t f;
+ unsigned long offset = (unsigned long)head->func;
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (__is_kfree_rcu_offset(offset)) {
+ trace_rcu_invoke_kfree_callback("", head, offset);
+ kfree((void *)head - offset);
+ rcu_lock_release(&rcu_callback_map);
+ return true;
+ }
+
+ trace_rcu_invoke_callback("", head);
+ f = head->func;
+ WRITE_ONCE(head->func, (rcu_callback_t)0L);
+ f(head);
+ rcu_lock_release(&rcu_callback_map);
+ return false;
+}
+
/* Invoke the RCU callbacks whose grace period has elapsed. */
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
@@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim("", list);
+ rcu_reclaim_tiny(list);
local_bh_enable();
list = next;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..d91c9156fab2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -43,7 +43,6 @@
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
-#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
@@ -55,7 +54,9 @@
#include <linux/oom.h>
#include <linux/smpboot.h>
#include <linux/jiffies.h>
+#include <linux/slab.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -83,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
-struct rcu_state rcu_state = {
+static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@@ -187,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* held, but the bit corresponding to the current CPU will be stable
* in most contexts.
*/
-unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
return READ_ONCE(rnp->qsmaskinitnext);
}
@@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+ if (rcu_segcblist_is_enabled(&rdp->cblist))
return rcu_segcblist_n_cbs(&rdp->cblist);
- return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+ return 0;
}
void rcu_softirq_qs(void)
@@ -293,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
*
* No ordering, as we are sampling CPU-local information.
*/
-bool rcu_dynticks_curr_cpu_in_eqs(void)
+static bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -304,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(struct rcu_data *rdp)
{
int snap = atomic_add_return(0, &rdp->dynticks);
@@ -363,7 +364,7 @@ bool rcu_eqs_special_set(int cpu)
*
* The caller must have disabled interrupts and must not be idle.
*/
-static void __maybe_unused rcu_momentary_dyntick_idle(void)
+void rcu_momentary_dyntick_idle(void)
{
int special;
@@ -374,6 +375,7 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
rcu_preempt_deferred_qs(current);
}
+EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
/**
* rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
@@ -416,6 +418,12 @@ module_param(qlowmark, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
+static int rcu_divisor = 7;
+module_param(rcu_divisor, int, 0644);
+
+/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+module_param(rcu_resched_ns, long, 0644);
/*
* How long the grace period must be before we start recruiting
@@ -489,7 +497,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
-static int rcu_pending(void);
+static int rcu_pending(int user);
/*
* Return the number of RCU GPs completed thus far for debug & stats.
@@ -521,16 +529,6 @@ static struct rcu_node *rcu_get_root(void)
}
/*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
- if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
- return "???";
- return gp_state_names[gs];
-}
-
-/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -569,7 +567,7 @@ static void rcu_eqs_enter(bool user)
}
lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
@@ -642,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+ trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+ atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (irq)
@@ -736,7 +735,7 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+ trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -792,8 +791,8 @@ void rcu_user_exit(void)
*/
static __always_inline void rcu_nmi_enter_common(bool irq)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
long incby = 2;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
/* Complain about underflow. */
WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -817,10 +816,20 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
rcu_cleanup_after_idle();
incby = 1;
+ } else if (tick_nohz_full_cpu(rdp->cpu) &&
+ rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
+ READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ raw_spin_lock_rcu_node(rdp->mynode);
+ // Recheck under lock.
+ if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ rdp->rcu_forced_tick = true;
+ tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ }
+ raw_spin_unlock_rcu_node(rdp->mynode);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+ rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
@@ -878,6 +887,22 @@ void rcu_irq_enter_irqson(void)
local_irq_restore(flags);
}
+/*
+ * If any sort of urgency was applied to the current CPU (for example,
+ * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
+ * to get to a quiescent state, disable it.
+ */
+static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
+{
+ raw_lockdep_assert_held_rcu_node(rdp->mynode);
+ WRITE_ONCE(rdp->rcu_urgent_qs, false);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
+ if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
+ tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ rdp->rcu_forced_tick = false;
+ }
+}
+
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
@@ -1066,6 +1091,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+ WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
@@ -1251,6 +1277,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
unsigned long gp_seq_req;
bool ret = false;
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1292,7 +1319,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
unsigned long c;
bool needwake;
- lockdep_assert_irqs_disabled();
+ rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
@@ -1318,6 +1345,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
*/
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1335,6 +1363,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
}
/*
+ * Move and classify callbacks, but only if doing so won't require
+ * that the RCU grace-period kthread be awakened.
+ */
+static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
+ !raw_spin_trylock_rcu_node(rnp))
+ return;
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
@@ -1342,8 +1385,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
- bool ret;
+ bool ret = false;
bool need_gp;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1353,10 +1398,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
unlikely(READ_ONCE(rdp->gpwrap))) {
- ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
+ if (!offloaded)
+ ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
- ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
+ if (!offloaded)
+ ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1657,6 +1704,7 @@ static void rcu_gp_cleanup(void)
unsigned long gp_duration;
bool needgp = false;
unsigned long new_gp_seq;
+ bool offloaded;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
struct swait_queue_head *sq;
@@ -1722,7 +1770,9 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
+ offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
+ if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
rcu_state.gp_req_activity = jiffies;
trace_rcu_grace_period(rcu_state.name,
@@ -1881,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
- if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1916,7 +1966,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake;
+ bool needwake = false;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1935,7 +1987,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
- rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
@@ -1943,8 +1994,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!offloaded)
+ needwake = rcu_accelerate_cbs(rnp, rdp);
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
if (needwake)
@@ -2067,6 +2120,9 @@ int rcutree_dead_cpu(unsigned int cpu)
rcu_boost_kthread_setaffinity(rnp, -1);
/* Do any needed no-CB deferred wakeups from this CPU. */
do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
+
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@@ -2077,14 +2133,16 @@ int rcutree_dead_cpu(unsigned int cpu)
static void rcu_do_batch(struct rcu_data *rdp)
{
unsigned long flags;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
+ long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
trace_rcu_batch_start(rcu_state.name,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), 0);
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
@@ -2099,31 +2157,63 @@ static void rcu_do_batch(struct rcu_data *rdp)
* callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- bl = rdp->blimit;
+ pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ bl = max(rdp->blimit, pending >> rcu_divisor);
+ if (unlikely(bl > 100))
+ tlimit = local_clock() + rcu_resched_ns;
trace_rcu_batch_start(rcu_state.name,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- local_irq_restore(flags);
+ if (offloaded)
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ rcu_callback_t f;
+
debug_rcu_head_unqueue(rhp);
- if (__rcu_reclaim(rcu_state.name, rhp))
- rcu_cblist_dequeued_lazy(&rcl);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+ f = rhp->func;
+ WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
+ f(rhp);
+
+ rcu_lock_release(&rcu_callback_map);
+
/*
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl &&
+ if (-rcl.len >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ if (offloaded) {
+ WARN_ON_ONCE(in_serving_softirq());
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ cond_resched_tasks_rcu_qs();
+ lockdep_assert_irqs_enabled();
+ local_bh_disable();
+ }
}
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
count = -rcl.len;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2149,13 +2239,16 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
+ WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ count != 0 && rcu_segcblist_empty(&rdp->cblist));
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
/*
@@ -2180,7 +2273,7 @@ void rcu_sched_clock_irq(int user)
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);
- if (rcu_pending())
+ if (rcu_pending(user))
invoke_rcu_core();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -2198,6 +2291,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
int cpu;
unsigned long flags;
unsigned long mask;
+ struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
@@ -2205,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPT) ||
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
@@ -2219,11 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue;
}
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
- if ((rnp->qsmask & bit) != 0) {
- if (f(per_cpu_ptr(&rcu_data, cpu)))
- mask |= bit;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (f(rdp)) {
+ mask |= rdp->grpmask;
+ rcu_disable_urgency_upon_qs(rdp);
}
}
if (mask != 0) {
@@ -2251,7 +2345,7 @@ void rcu_force_quiescent_state(void)
rnp = __this_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
- !raw_spin_trylock(&rnp->fqslock);
+ !raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret)
@@ -2280,6 +2374,8 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2299,7 +2395,7 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist)) {
+ rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
@@ -2309,7 +2405,7 @@ static __latent_entropy void rcu_core(void)
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
@@ -2380,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
int spincnt;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2489,10 +2585,11 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct rcu_data *rdp;
+ bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2514,42 +2611,36 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
- if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
- int offline;
-
- if (cpu != -1)
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (likely(rdp->mynode)) {
- /* Post-boot, so this should be for a no-CBs CPU. */
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* Offline CPU, _call_rcu() illegal, leak callback. */
- local_irq_restore(flags);
- return;
- }
- /*
- * Very early boot, before rcu_init(). Initialize if needed
- * and then drop through to queue the callback.
- */
- WARN_ON_ONCE(cpu != -1);
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
+ // This can trigger due to call_rcu() from offline CPU:
+ WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
WARN_ON_ONCE(!rcu_is_watching());
+ // Very early boot, before rcu_init(). Initialize if needed
+ // and then drop through to queue the callback.
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
- rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ return; // Enqueued onto ->nocb_bypass, so just leave.
+ /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
+ rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
else
trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ } else {
+ __call_rcu_core(rdp, head, flags);
+ local_irq_restore(flags);
+ }
}
/**
@@ -2589,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 0);
+ __call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(call_rcu);
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+ struct rcu_work rcu_work;
+ struct rcu_head *head_free;
+ struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
+ * @initialized: The @lock and @rcu_work fields have been initialized
+ *
+ * This is a per-CPU structure. The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files. Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+ struct rcu_head *head;
+ struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+ spinlock_t lock;
+ struct delayed_work monitor_work;
+ bool monitor_todo;
+ bool initialized;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct rcu_head *head, *next;
+ struct kfree_rcu_cpu *krcp;
+ struct kfree_rcu_cpu_work *krwp;
+
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
+ spin_lock_irqsave(&krcp->lock, flags);
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // List "head" is now private, so traverse locklessly.
+ for (; head; head = next) {
+ unsigned long offset = (unsigned long)head->func;
+
+ next = head->next;
+ // Potentially optimize with kfree_bulk in future.
+ debug_rcu_head_unqueue(head);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+ /* Could be optimized with kfree_bulk() in future. */
+ kfree((void *)head - offset);
+ }
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
+/*
+ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+ *
+ * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
+ * timeout has been reached.
+ */
+static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+ struct kfree_rcu_cpu_work *krwp = NULL;
+
+ lockdep_assert_held(&krcp->lock);
+ for (i = 0; i < KFREE_N_BATCHES; i++)
+ if (!krcp->krw_arr[i].head_free) {
+ krwp = &(krcp->krw_arr[i]);
+ break;
+ }
+
+ // If a previous RCU batch is in progress, we cannot immediately
+ // queue another one, so return false to tell caller to retry.
+ if (!krwp)
+ return false;
+
+ krwp->head_free = krcp->head;
+ krcp->head = NULL;
+ INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+ queue_rcu_work(system_wq, &krwp->rcu_work);
+ return true;
+}
+
+static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
+ unsigned long flags)
+{
+ // Attempt to start a new batch.
+ krcp->monitor_todo = false;
+ if (queue_kfree_rcu_work(krcp)) {
+ // Success! Our job is done here.
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ return;
+ }
+
+ // Previous RCU batch still in progress, try again later.
+ krcp->monitor_todo = true;
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
+ monitor_work.work);
+
+ spin_lock_irqsave(&krcp->lock, flags);
+ if (krcp->monitor_todo)
+ kfree_rcu_drain_unlock(krcp, flags);
+ else
+ spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * Queue a request for lazy invocation of kfree() after a grace period.
+ *
+ * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
+ * will be kfree'd in workqueue context. This allows us to:
+ *
+ * 1. Batch requests together to reduce the number of grace periods during
+ * heavy kfree_rcu() load.
+ *
+ * 2. It makes it possible to use kfree_bulk() on a large number of
+ * kfree_rcu() requests thus reducing cache misses and the per-object
+ * overhead of kfree().
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 1);
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ if (krcp->initialized)
+ spin_lock(&krcp->lock);
+
+ // Queue the object but don't yet schedule the batch.
+ if (debug_rcu_head_queue(head)) {
+ // Probable double kfree_rcu(), just leak.
+ WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+ __func__, head);
+ goto unlock_return;
+ }
+ head->func = func;
+ head->next = krcp->head;
+ krcp->head = head;
+
+ // Set timer to drain after KFREE_DRAIN_JIFFIES.
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !krcp->monitor_todo) {
+ krcp->monitor_todo = true;
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ }
+
+unlock_return:
+ if (krcp->initialized)
+ spin_unlock(&krcp->lock);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+void __init kfree_rcu_scheduler_running(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ for_each_online_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ spin_lock_irqsave(&krcp->lock, flags);
+ if (!krcp->head || krcp->monitor_todo) {
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ continue;
+ }
+ krcp->monitor_todo = true;
+ schedule_delayed_work_on(cpu, &krcp->monitor_work,
+ KFREE_DRAIN_JIFFIES);
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ }
+}
+
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPT.
+ * implies a grace period. Later on, this is never the case for PREEMPTION.
*
- * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * Howevr, because a context switch is a grace period for !PREEMPTION, any
* blocking grace-period wait automatically implies a grace period if
* there is only one CPU online at any point time during execution of
* either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@@ -2622,7 +2915,7 @@ static int rcu_blocking_is_gp(void)
{
int ret;
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
@@ -2727,20 +3020,26 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
* CPU-local state are performed first. However, we must check for CPU
* stalls first, else we might not get a chance.
*/
-static int rcu_pending(void)
+static int rcu_pending(int user)
{
+ bool gp_in_progress;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
- /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
- if (rcu_nohz_full_cpu())
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ return 1;
+
+ /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
+ if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
+ gp_in_progress = rcu_gp_in_progress();
+ if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
/* Does this CPU have callbacks ready to invoke? */
@@ -2748,8 +3047,9 @@ static int rcu_pending(void)
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
- if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) &&
+ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
+ (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
+ !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -2758,10 +3058,6 @@ static int rcu_pending(void)
unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- /* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return 1;
-
/* nothing to do */
return 0;
}
@@ -2784,7 +3080,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
{
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
rcu_barrier_trace(TPS("LastCB"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
complete(&rcu_state.barrier_completion);
} else {
rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
@@ -2801,13 +3097,16 @@ static void rcu_barrier_func(void *unused)
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
- if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
}
+ rcu_nocb_unlock(rdp);
}
/**
@@ -2832,7 +3131,7 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
@@ -2858,28 +3157,17 @@ void rcu_barrier(void)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
- continue;
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_is_nocb_cpu(cpu)) {
- if (!rcu_nocb_cpu_needs_barrier(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
- rcu_state.barrier_sequence);
- smp_mb__before_atomic();
- atomic_inc(&rcu_state.barrier_cpu_count);
- __call_rcu(&rdp->barrier_head,
- rcu_barrier_callback, cpu, 0);
- }
- } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ if (!cpu_online(cpu) &&
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
+ continue;
+ if (rcu_segcblist_n_cbs(&rdp->cblist)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
} else {
rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
}
}
put_online_cpus();
@@ -2958,7 +3246,8 @@ rcu_boot_init_percpu_data(int cpu)
* Initializes a CPU's per-CPU RCU data. Note that only one online or
* offline event can be happening at a given time. Note also that we can
* accept some slop in the rsp->gp_seq access due to the fact that this
- * CPU cannot possibly have any RCU callbacks in flight yet.
+ * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
+ * And any offloaded callbacks are being numbered elsewhere.
*/
int rcutree_prepare_cpu(unsigned int cpu)
{
@@ -2972,7 +3261,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !init_nocb_callback_list(rdp))
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
@@ -3029,6 +3318,9 @@ int rcutree_online_cpu(unsigned int cpu)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
rcutree_affinity_setting(cpu, -1);
+
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@@ -3049,6 +3341,9 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
return 0;
}
@@ -3094,6 +3389,7 @@ void rcu_cpu_starting(unsigned int cpu)
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
+ rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
@@ -3151,29 +3447,38 @@ void rcutree_migrate_callbacks(int cpu)
{
unsigned long flags;
struct rcu_data *my_rdp;
+ struct rcu_node *my_rnp;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp_root = rcu_get_root();
bool needwake;
- if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
local_irq_save(flags);
my_rdp = this_cpu_ptr(&rcu_data);
- if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
- local_irq_restore(flags);
- return;
- }
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ my_rnp = my_rdp->mynode;
+ rcu_nocb_lock(my_rdp); /* irqs already disabled. */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
- needwake = rcu_advance_cbs(rnp_root, rdp) ||
- rcu_advance_cbs(rnp_root, my_rdp);
+ needwake = rcu_advance_cbs(my_rnp, rdp) ||
+ rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
+ rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
- raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ __call_rcu_nocb_wake(my_rdp, true, flags);
+ } else {
+ rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ }
if (needwake)
rcu_gp_kthread_wake();
+ lockdep_assert_irqs_enabled();
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -3234,13 +3539,13 @@ static int __init rcu_spawn_gp_kthread(void)
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
return 0;
- rnp = rcu_get_root();
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rcu_state.gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ rnp = rcu_get_root();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_state.gp_kthread = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
@@ -3455,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
+static void __init kfree_rcu_batch_init(void)
+{
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ spin_lock_init(&krcp->lock);
+ for (i = 0; i < KFREE_N_BATCHES; i++)
+ krcp->krw_arr[i].krcp = krcp;
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ krcp->initialized = true;
+ }
+}
+
void __init rcu_init(void)
{
int cpu;
rcu_early_boot_tests();
+ kfree_rcu_batch_init();
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7acaf3a62d39..0c87e4c161c2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -16,7 +16,6 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#include <linux/swait.h>
-#include <linux/stop_machine.h>
#include <linux/rcu_node_tree.h>
#include "rcu_segcblist.h"
@@ -181,8 +180,9 @@ struct rcu_data {
atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
+ bool rcu_forced_tick; /* Forced tick to provide QS. */
+ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
@@ -194,29 +194,38 @@ struct rcu_data {
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
- struct rcu_head *nocb_head; /* CBs waiting for kthread. */
- struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
- atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
- struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
- struct rcu_head **nocb_follower_tail;
- struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
- struct task_struct *nocb_kthread;
+ struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
+ atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
-
- /* The following fields are used by the leader, hence own cacheline. */
- struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
- /* CBs waiting for GP. */
- struct rcu_head **nocb_gp_tail;
- bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
- struct rcu_data *nocb_next_follower;
- /* Next follower in wakeup chain. */
-
- /* The following fields are used by the follower, hence new cachline. */
- struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
- /* Leader CPU takes GP-end wakeups. */
+ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+
+ /* The following fields are used by call_rcu, hence own cacheline. */
+ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
+ struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
+ unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
+ unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
+ int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
+
+ /* The following fields are used by GP kthread, hence own cacheline. */
+ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
+ struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
+ u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
+ u8 nocb_gp_bypass; /* Found a bypass on last scan? */
+ u8 nocb_gp_gp; /* GP to wait for on last scan? */
+ unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
+ unsigned long nocb_gp_loops; /* # passes through wait code. */
+ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
+ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
+ struct task_struct *nocb_cb_kthread;
+ struct rcu_data *nocb_next_cb_rdp;
+ /* Next rcu_data in wakeup chain. */
+
+ /* The following fields are used by CB kthread, hence new cacheline. */
+ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
+ /* GP rdp takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 6) RCU priority boosting. */
@@ -358,18 +367,6 @@ struct rcu_state {
#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */
#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */
-static const char * const gp_state_names[] = {
- "RCU_GP_IDLE",
- "RCU_GP_WAIT_GPS",
- "RCU_GP_DONE_GPS",
- "RCU_GP_ONOFF",
- "RCU_GP_INIT",
- "RCU_GP_WAIT_FQS",
- "RCU_GP_DOING_FQS",
- "RCU_GP_CLEANUP",
- "RCU_GP_CLEANED",
-};
-
/*
* In order to export the rcu_state name to the tracing tools, it
* needs to be added in the __tracepoint_string section.
@@ -393,8 +390,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
#define RCU_NAME rcu_name
#endif /* #else #ifdef CONFIG_TRACING */
-int rcu_dynticks_snap(struct rcu_data *rdp);
-
/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
@@ -405,7 +400,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
-void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -419,25 +413,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
-static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags);
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j);
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags);
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
+static void show_rcu_nocb_state(struct rcu_data *rdp);
+static void rcu_nocb_lock(struct rcu_data *rdp);
+static void rcu_nocb_unlock(struct rcu_data *rdp);
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags);
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool init_nocb_callback_list(struct rcu_data *rdp);
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp);
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ local_irq_save(flags); \
+ else \
+ raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+} while (0)
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index af7e7b9c86af..dcbd75791f39 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void)
}
/*
- * Return then value that expedited-grace-period counter will have
+ * Return the value that the expedited-grace-period counter will have
* at the end of the current grace period.
*/
static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
@@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void)
}
/*
- * Take a snapshot of the expedited-grace-period counter.
+ * Take a snapshot of the expedited-grace-period counter, which is the
+ * earliest value that will indicate that a full grace period has
+ * elapsed since the current time.
*/
static unsigned long rcu_exp_gp_seq_snap(void)
{
@@ -134,7 +136,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
rcu_for_each_node_breadth_first(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
+ WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -143,31 +145,26 @@ static void __maybe_unused sync_exp_reset_tree(void)
* Return non-zero if there is no RCU expedited grace period in progress
* for the specified rcu_node structure, in other words, if all CPUs and
* tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the specificed rcu_node structure's ->lock
+ * for the current expedited grace period.
*/
-static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_exp_done(struct rcu_node *rnp)
{
raw_lockdep_assert_held_rcu_node(rnp);
-
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
- * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
- * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
- * itself
+ * Like sync_rcu_exp_done(), but where the caller does not hold the
+ * rcu_node's ->lock.
*/
-static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
{
unsigned long flags;
bool ret;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ret = sync_rcu_preempt_exp_done(rnp);
+ ret = sync_rcu_exp_done(rnp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return ret;
@@ -181,8 +178,6 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
* which the task was queued or to one of that rcu_node structure's ancestors,
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
- *
- * Caller must hold the specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -190,8 +185,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
{
unsigned long mask;
+ raw_lockdep_assert_held_rcu_node(rnp);
for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
+ if (!sync_rcu_exp_done(rnp)) {
if (!rnp->expmask)
rcu_initiate_boost(rnp, flags);
else
@@ -211,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
rnp = rnp->parent;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
}
}
@@ -234,14 +230,23 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
unsigned long mask, bool wake)
{
+ int cpu;
unsigned long flags;
+ struct rcu_data *rdp;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = false;
+ tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}
@@ -345,8 +350,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
int snap;
if (raw_smp_processor_id() == cpu ||
@@ -372,12 +377,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
- if (!(mask_ofl_ipi & mask))
- continue;
retry_ipi:
if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
@@ -389,10 +392,10 @@ retry_ipi:
}
ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
put_cpu();
- if (!ret) {
- mask_ofl_ipi &= ~mask;
+ /* The CPU will report the QS in response to the IPI. */
+ if (!ret)
continue;
- }
+
/* Failed, raced with CPU hotplug operation. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if ((rnp->qsmaskinitnext & mask) &&
@@ -403,13 +406,12 @@ retry_ipi:
schedule_timeout_uninterruptible(1);
goto retry_ipi;
}
- /* CPU really is offline, so we can ignore it. */
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ /* CPU really is offline, so we must report its QS. */
+ if (rnp->expmask & mask)
+ mask_ofl_test |= mask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
if (mask_ofl_test)
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
@@ -456,29 +458,61 @@ static void sync_rcu_exp_select_cpus(void)
flush_work(&rnp->rew.rew_work);
}
-static void synchronize_sched_expedited_wait(void)
+/*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+ int t;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+ sync_rcu_exp_done_unlocked(rnp_root),
+ tlimit);
+ // Workqueues should not be signaled.
+ if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+ return true;
+ WARN_ON(t < 0); /* workqueues should not be signaled. */
+ return false;
+}
+
+/*
+ * Wait for the expedited grace period to elapse, issuing any needed
+ * RCU CPU stall warnings along the way.
+ */
+static void synchronize_rcu_expedited_wait(void)
{
int cpu;
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
int ndetected;
+ struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
- int ret;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies;
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (synchronize_rcu_expedited_wait_once(1))
+ return;
+ rcu_for_each_leaf_node(rnp) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = true;
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
+ }
+ }
for (;;) {
- ret = swait_event_timeout_exclusive(
- rcu_state.expedited_wq,
- sync_rcu_preempt_exp_done_unlocked(rnp_root),
- jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
+ if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
- WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress)
continue;
panic_on_rcu_stall();
@@ -491,7 +525,7 @@ static void synchronize_sched_expedited_wait(void)
struct rcu_data *rdp;
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -503,17 +537,18 @@ static void synchronize_sched_expedited_wait(void)
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rcu_state.expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ READ_ONCE(rnp_root->expmask),
+ ".T"[!!rnp_root->exp_tasks]);
if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done_unlocked(rnp))
+ if (sync_rcu_exp_done_unlocked(rnp))
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
+ READ_ONCE(rnp->expmask),
".T"[!!rnp->exp_tasks]);
}
pr_cont("\n");
@@ -521,7 +556,7 @@ static void synchronize_sched_expedited_wait(void)
rcu_for_each_leaf_node(rnp) {
for_each_leaf_node_possible_cpu(rnp, cpu) {
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
dump_cpu_task(cpu);
}
@@ -540,15 +575,14 @@ static void rcu_exp_wait_wake(unsigned long s)
{
struct rcu_node *rnp;
- synchronize_sched_expedited_wait();
- rcu_exp_gp_seq_end();
- trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
+ synchronize_rcu_expedited_wait();
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
+ // Switch over to wakeup mode, allowing the next GP to proceed.
+ // End the previous grace period only after acquiring the mutex
+ // to ensure that only one GP runs concurrently with wakeups.
mutex_lock(&rcu_state.exp_wake_mutex);
+ rcu_exp_gp_seq_end();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -559,7 +593,7 @@ static void rcu_exp_wait_wake(unsigned long s)
spin_unlock(&rnp->exp_lock);
}
smp_mb(); /* All above changes before wakeup. */
- wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
}
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
mutex_unlock(&rcu_state.exp_wake_mutex);
@@ -610,7 +644,7 @@ static void rcu_exp_handler(void *unused)
* critical section. If also enabled or idle, immediately
* report the quiescent state, otherwise defer.
*/
- if (!t->rcu_read_lock_nesting) {
+ if (!rcu_preempt_depth()) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
@@ -634,7 +668,7 @@ static void rcu_exp_handler(void *unused)
* can have caused this quiescent state to already have been
* reported, so we really do need to check ->expmask.
*/
- if (t->rcu_read_lock_nesting > 0) {
+ if (rcu_preempt_depth() > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
rdp->exp_deferred_qs = true;
@@ -670,7 +704,7 @@ static void rcu_exp_handler(void *unused)
}
}
-/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */
+/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
static void sync_sched_exp_online_cleanup(int cpu)
{
}
@@ -781,17 +815,18 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
* other hand, if the CPU is not in an RCU read-side critical section,
* the IPI handler reports the quiescent state immediately.
*
- * Although this is a greate improvement over previous expedited
+ * Although this is a great improvement over previous expedited
* implementations, it is still unfriendly to real-time workloads, so is
* thus not recommended for any sort of common-case code. In fact, if
* you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
+ * your code to batch your updates, and then use a single synchronize_rcu()
* instead.
*
* This has the same semantics as (but is more brutal than) synchronize_rcu().
*/
void synchronize_rcu_expedited(void)
{
+ bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -817,7 +852,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ if (unlikely(boottime)) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -835,5 +870,8 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
+
+ if (likely(!boottime))
+ destroy_work_on_stack(&rew.rew_work);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index acb225023ed1..c6ea81cd4189 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
- rnp->gp_tasks = &t->rcu_node_entry;
+ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
}
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -288,11 +288,10 @@ void rcu_note_context_switch(bool preempt)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
- if (t->rcu_read_lock_nesting > 0 &&
+ WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
@@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt)
? rnp->gp_seq
: rcu_seq_snap(&rnp->gp_seq));
rcu_preempt_ctxt_queue(rnp, rdp);
- } else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special.s) {
-
- /*
- * Complete exit from RCU read-side critical section on
- * behalf of preempted instance of __rcu_read_unlock().
- */
- rcu_read_unlock_special(t);
- rcu_preempt_deferred_qs(t);
} else {
rcu_preempt_deferred_qs(t);
}
@@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt)
if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -351,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- return rnp->gp_tasks != NULL;
+ return READ_ONCE(rnp->gp_tasks) != NULL;
}
/* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -359,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
#define RCU_NEST_NMAX (-INT_MAX / 2)
#define RCU_NEST_PMAX (INT_MAX / 2)
+static void rcu_preempt_read_enter(void)
+{
+ current->rcu_read_lock_nesting++;
+}
+
+static void rcu_preempt_read_exit(void)
+{
+ current->rcu_read_lock_nesting--;
+}
+
+static void rcu_preempt_depth_set(int val)
+{
+ current->rcu_read_lock_nesting = val;
+}
+
/*
* Preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
@@ -366,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
*/
void __rcu_read_lock(void)
{
- current->rcu_read_lock_nesting++;
+ rcu_preempt_read_enter();
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
- WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
+ WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
barrier(); /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -384,19 +388,19 @@ void __rcu_read_unlock(void)
{
struct task_struct *t = current;
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
+ if (rcu_preempt_depth() != 1) {
+ rcu_preempt_read_exit();
} else {
barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
+ rcu_preempt_depth_set(-RCU_NEST_BIAS);
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
+ rcu_preempt_depth_set(0);
}
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- int rrln = t->rcu_read_lock_nesting;
+ int rrln = rcu_preempt_depth();
WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
}
@@ -455,15 +459,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
local_irq_restore(flags);
return;
}
- t->rcu_read_unlock_special.b.deferred_qs = false;
- if (special.b.need_qs) {
+ t->rcu_read_unlock_special.s = 0;
+ if (special.b.need_qs)
rcu_qs();
- t->rcu_read_unlock_special.b.need_qs = false;
- if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
- local_irq_restore(flags);
- return;
- }
- }
/*
* Respond to a request by an expedited grace period for a
@@ -471,17 +469,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs) {
+ if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
- if (!t->rcu_read_unlock_special.s) {
- local_irq_restore(flags);
- return;
- }
- }
/* Clean up if blocked during RCU read-side critical section. */
if (special.b.blocked) {
- t->rcu_read_unlock_special.b.blocked = false;
/*
* Remove this task from the list it blocked on. The task
@@ -496,7 +488,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
- empty_exp = sync_rcu_preempt_exp_done(rnp);
+ empty_exp = sync_rcu_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
@@ -504,7 +496,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gp_seq, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
+ WRITE_ONCE(rnp->gp_tasks, np);
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
@@ -520,7 +512,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
- empty_exp_now = sync_rcu_preempt_exp_done(rnp);
+ empty_exp_now = sync_rcu_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gp_seq,
@@ -562,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
- t->rcu_read_lock_nesting <= 0;
+ rcu_preempt_depth() <= 0;
}
/*
@@ -575,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
- bool couldrecurse = t->rcu_read_lock_nesting >= 0;
+ bool couldrecurse = rcu_preempt_depth() >= 0;
if (!rcu_preempt_need_deferred_qs(t))
return;
if (couldrecurse)
- t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
+ rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS);
local_irq_save(flags);
rcu_preempt_deferred_qs_irqrestore(t, flags);
if (couldrecurse)
- t->rcu_read_lock_nesting += RCU_NEST_BIAS;
+ rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS);
}
/*
@@ -621,27 +613,22 @@ static void rcu_read_unlock_special(struct task_struct *t)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- t->rcu_read_unlock_special.b.exp_hint = false;
exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
- (rdp->grpmask & rnp->expmask) ||
+ (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
tick_nohz_full_cpu(rdp->cpu);
// Need to defer quiescent state until everything is enabled.
- if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
- (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+ if (irqs_were_disabled && use_softirq &&
+ (in_interrupt() ||
+ (exp && !t->rcu_read_unlock_special.b.deferred_qs))) {
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
- } else if (exp && irqs_were_disabled && !use_softirq &&
- !t->rcu_read_unlock_special.b.deferred_qs) {
- // Safe to awaken and we get no help from enabling
- // irqs, unlike bh/preempt.
- invoke_rcu_core();
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting or NO_HZ_FULL, slow is OK.
set_tsk_need_resched(current);
set_preempt_need_resched();
- if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
!rdp->defer_qs_iw_pending && exp) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
@@ -655,7 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_restore(flags);
return;
}
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
rcu_preempt_deferred_qs_irqrestore(t, flags);
}
@@ -663,8 +649,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
- * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
- * must be held by the caller.
+ * invoked -before- updating this rnp's ->gp_seq.
*
* Also, if there are blocked tasks on the list, they automatically
* block the newly created grace period, so set up ->gp_tasks accordingly.
@@ -674,11 +659,12 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
struct task_struct *t;
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
+ raw_lockdep_assert_held_rcu_node(rnp);
if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
dump_blkd_tasks(rnp, 10);
if (rcu_preempt_has_tasks(rnp) &&
(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
- rnp->gp_tasks = rnp->blkd_tasks.next;
+ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
t = container_of(rnp->gp_tasks, struct task_struct,
rcu_node_entry);
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
@@ -701,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user)
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
- if (t->rcu_read_lock_nesting > 0 ||
+ if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
if (rcu_preempt_need_deferred_qs(t)) {
@@ -711,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user)
} else if (rcu_preempt_need_deferred_qs(t)) {
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;
- } else if (!t->rcu_read_lock_nesting) {
+ } else if (!rcu_preempt_depth()) {
rcu_qs(); /* Report immediate QS. */
return;
}
/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
- if (t->rcu_read_lock_nesting > 0 &&
+ if (rcu_preempt_depth() > 0 &&
__this_cpu_read(rcu_data.core_needs_qs) &&
__this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
!t->rcu_read_unlock_special.b.need_qs &&
@@ -738,11 +724,11 @@ void exit_rcu(void)
struct task_struct *t = current;
if (unlikely(!list_empty(&current->rcu_node_entry))) {
- t->rcu_read_lock_nesting = 1;
+ rcu_preempt_depth_set(1);
barrier();
WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
- } else if (unlikely(t->rcu_read_lock_nesting)) {
- t->rcu_read_lock_nesting = 1;
+ } else if (unlikely(rcu_preempt_depth())) {
+ rcu_preempt_depth_set(1);
} else {
return;
}
@@ -772,7 +758,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
- __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
+ __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
+ rnp->exp_tasks);
pr_info("%s: ->blkd_tasks", __func__);
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
@@ -803,7 +790,7 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Note a quiescent state for PREEMPT=n. Because we do not need to know
+ * Note a quiescent state for PREEMPTION=n. Because we do not need to know
* how many quiescent states passed, just if there was at least one since
* the start of the grace period, this just sets a flag. The caller must
* have disabled preemption.
@@ -828,11 +815,6 @@ static void rcu_qs(void)
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
- *
- * The barrier() calls are redundant in the common case when this is
- * called externally, but just in case this is called from within this
- * file.
- *
*/
void rcu_all_qs(void)
{
@@ -847,24 +829,21 @@ void rcu_all_qs(void)
return;
}
this_cpu_write(rcu_data.rcu_urgent_qs, false);
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
rcu_qs();
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
/*
- * Note a PREEMPT=n context switch. The caller must have disabled interrupts.
+ * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
*/
void rcu_note_context_switch(bool preempt)
{
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_qs();
/* Load rcu_urgent_qs before other flags. */
@@ -877,7 +856,6 @@ void rcu_note_context_switch(bool preempt)
rcu_tasks_qs(current);
out:
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1134,7 +1112,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
*/
-static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
int rnp_index = rnp - rcu_get_root();
unsigned long flags;
@@ -1142,25 +1120,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct task_struct *t;
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return 0;
+ return;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return 0;
+ return;
rcu_state.boost = 1;
+
if (rnp->boost_kthread_task != NULL)
- return 0;
+ return;
+
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
+ if (WARN_ON_ONCE(IS_ERR(t)))
+ return;
+
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- return 0;
}
/*
@@ -1201,7 +1181,7 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
static void rcu_prepare_kthreads(int cpu)
@@ -1211,7 +1191,7 @@ static void rcu_prepare_kthreads(int cpu)
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1248,10 +1228,10 @@ static void rcu_prepare_kthreads(int cpu)
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API.
*
* Because we not have RCU_FAST_NO_HZ, just check whether or not this
* CPU has RCU callbacks queued.
@@ -1259,7 +1239,8 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1283,10 +1264,9 @@ static void rcu_prepare_for_idle(void)
/*
* This code is invoked when a CPU goes idle, at which point we want
* to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode. This is handled by a
- * state machine implemented by rcu_prepare_for_idle() below.
+ * the energy-efficient dyntick-idle mode.
*
- * The following three proprocessor symbols control this state machine:
+ * The following preprocessor symbol controls this:
*
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
@@ -1295,21 +1275,15 @@ static void rcu_prepare_for_idle(void)
* number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
* system. And if you are -that- concerned about energy efficiency,
* just power the system down and be done with it!
- * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
- * permitted to sleep in dyntick-idle mode with only lazy RCU
- * callbacks pending. Setting this too high can OOM your system.
*
- * The values below work well in practice. If future workloads require
+ * The value below works well in practice. If future workloads require
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
-static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
-module_param(rcu_idle_lazy_gp_delay, int, 0644);
/*
* Try to advance callbacks on the current CPU, but only if it has been
@@ -1348,8 +1322,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
/*
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller to set the timeout based on whether or not there are non-lazy
- * callbacks.
+ * caller about what to set the timeout.
*
* The caller must have disabled interrupts.
*/
@@ -1360,8 +1333,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* If no callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist)) {
+ /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
+ if (rcu_segcblist_empty(&rdp->cblist) ||
+ rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1374,25 +1348,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
}
rdp->last_accelerate = jiffies;
- /* Request timer delay depending on laziness, and round. */
- rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
- if (rdp->all_lazy) {
- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
- } else {
- dj = round_up(rcu_idle_gp_delay + jiffies,
- rcu_idle_gp_delay) - jiffies;
- }
+ /* Request timer and round. */
+ dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
+
*nextevt = basemono + dj * TICK_NSEC;
return 0;
}
/*
- * Prepare a CPU for idle from an RCU perspective. The first major task
- * is to sense whether nohz mode has been enabled or disabled via sysfs.
- * The second major task is to check to see if a non-lazy callback has
- * arrived at a CPU that previously had only lazy callbacks. The third
- * major task is to accelerate (that is, assign grace-period numbers to)
- * any recently arrived callbacks.
+ * Prepare a CPU for idle from an RCU perspective. The first major task is to
+ * sense whether nohz mode has been enabled or disabled via sysfs. The second
+ * major task is to accelerate (that is, assign grace-period numbers to) any
+ * recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
@@ -1404,7 +1371,7 @@ static void rcu_prepare_for_idle(void)
int tne;
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1419,17 +1386,6 @@ static void rcu_prepare_for_idle(void)
return;
/*
- * If a non-lazy callback arrived at a CPU having only lazy
- * callbacks, invoke RCU core for the side-effect of recalculating
- * idle duration on re-entry to idle.
- */
- if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
- rdp->all_lazy = false;
- invoke_rcu_core();
- return;
- }
-
- /*
* If we have not yet accelerated this jiffy, accelerate all
* callbacks on this CPU.
*/
@@ -1453,8 +1409,10 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1469,10 +1427,10 @@ static void rcu_cleanup_after_idle(void)
* specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
* created that pull the callbacks from the corresponding CPU, wait for
* a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into leaders, which manage incoming callbacks, wait for
- * grace periods, and awaken followers, and the followers, which only
- * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
- * do a wake_up() on their kthread when they insert a callback into any
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
* empty list, unless the rcu_nocb_poll boot parameter has been specified,
* in which case each kthread actively polls its CPU. (Which isn't so great
* for energy efficiency, but which does reduce RCU's overhead on that CPU.)
@@ -1515,6 +1473,116 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ cpu_online(rdp->cpu))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -1543,440 +1611,514 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
- unsigned long flags)
+static void wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
- struct rcu_data *rdp_leader = rdp->nocb_leader;
+ bool needwake = false;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_leader->nocb_kthread)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- if (rdp_leader->nocb_leader_sleep || force) {
- /* Prior smp_mb__after_atomic() orders against prior enqueue. */
- WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- del_timer(&rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
- swake_up_one(&rdp_leader->nocb_wq);
- } else {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ del_timer(&rdp->nocb_timer);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
}
/*
- * Kick the leader kthread for this NOCB group, but caller has not
- * acquired locks.
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
{
- unsigned long flags;
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ if (rdp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- __wake_nocb_leader(rdp, force, flags);
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
}
/*
- * Arrange to wake the leader kthread for this NOCB group at some
- * future time when it is safe to do so.
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
*/
-static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- unsigned long flags;
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
}
-/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- unsigned long ret;
-#ifdef CONFIG_PROVE_RCU
- struct rcu_head *rhp;
-#endif /* #ifdef CONFIG_PROVE_RCU */
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- /*
- * Check count of all no-CBs callbacks awaiting invocation.
- * There needs to be a barrier before this function is called,
- * but associated with a prior determination that no more
- * callbacks would be posted. In the worst case, the first
- * barrier in rcu_barrier() suffices (but the caller cannot
- * necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be a
- * barrier between the following load and posting of a callback
- * (if a callback is in fact needed). This is associated with an
- * atomic_inc() in the caller.
- */
- ret = rcu_get_n_cbs_nocb_cpu(rdp);
-
-#ifdef CONFIG_PROVE_RCU
- rhp = READ_ONCE(rdp->nocb_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_gp_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_follower_head);
-
- /* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
- rcu_scheduler_fully_active) {
- /* RCU callback enqueued before CPU first came online??? */
- pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
- cpu, rhp->func);
- WARN_ON_ONCE(1);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+ lockdep_assert_irqs_disabled();
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
}
-#endif /* #ifdef CONFIG_PROVE_RCU */
- return !!ret;
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
}
/*
- * Enqueue the specified string of rcu_head structures onto the specified
- * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
- * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
- * counts are supplied by rhcount and rhcount_lazy.
+ * Awaken the no-CBs grace-period kthead if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
*
* If warranted, also wake up the kthread servicing this CPUs queues.
*/
-static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
- struct rcu_head *rhp,
- struct rcu_head **rhtp,
- int rhcount, int rhcount_lazy,
- unsigned long flags)
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
- int len;
- struct rcu_head **old_rhpp;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
struct task_struct *t;
- /* Enqueue the callback on the nocb list and update counts. */
- atomic_long_add(rhcount, &rdp->nocb_q_count);
- /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
- old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- WRITE_ONCE(*old_rhpp, rhp);
- atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
- smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
-
- /* If we are not being polled and there is a kthread, awaken it ... */
- t = READ_ONCE(rdp->nocb_kthread);
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- len = rcu_get_n_cbs_nocb_cpu(rdp);
- if (old_rhpp == &rdp->nocb_head) {
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- wake_nocb_leader(rdp, false);
+ wake_nocb_gp(rdp, false, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
- rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
- if (!irqs_disabled_flags(flags)) {
- wake_nocb_leader(rdp, true);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeOvf"));
- } else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
}
- rdp->qlen_last_fqs_check = LONG_MAX / 2;
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_bypass_timer))
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
return;
}
-/*
- * This is a helper for __call_rcu(), which invokes this when the normal
- * callback queue is inoperable. If this is not a no-CBs CPU, this
- * function returns failure back to __call_rcu(), which can complain
- * appropriately.
- *
- * Otherwise, this function queues the callback where the corresponding
- * "rcuo" kthread can find it.
- */
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
+static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
- if (__is_kfree_rcu_offset((unsigned long)rhp->func))
- trace_rcu_kfree_callback(rcu_state.name, rhp,
- (unsigned long)rhp->func,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
- else
- trace_rcu_callback(rcu_state.name, rhp,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
-
- /*
- * If called from an extended quiescent state with interrupts
- * disabled, invoke the RCU core in order to allow the idle-entry
- * deferred-wakeup check to function.
- */
- if (irqs_disabled_flags(flags) &&
- !rcu_is_watching() &&
- cpu_online(smp_processor_id()))
- invoke_rcu_core();
-
- return true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ __call_rcu_nocb_wake(rdp, true, flags);
}
/*
- * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
- * not a no-CBs CPU.
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
+static void nocb_gp_wait(struct rcu_data *my_rdp)
{
- lockdep_assert_irqs_disabled();
- if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false; /* Not NOCBs CPU, caller must migrate CBs. */
- __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
- rcu_segcblist_tail(&rdp->cblist),
- rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_disable(&rdp->cblist);
- return true;
-}
-
-/*
- * If necessary, kick off a new grace period, and either way wait
- * for a subsequent grace period to complete.
- */
-static void rcu_nocb_wait_gp(struct rcu_data *rdp)
-{
- unsigned long c;
- bool d;
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
- struct rcu_node *rnp = rdp->mynode;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- local_irq_save(flags);
- c = rcu_seq_snap(&rcu_state.gp_seq);
- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
- local_irq_restore(flags);
- } else {
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_start_this_gp(rnp, rdp, c);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (needwake)
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+ if (bypass) { // Avoid race with first bypass CB.
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup,
+ RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
rcu_gp_kthread_wake();
}
- /*
- * Wait for the grace period. Do so interruptibly to avoid messing
- * up the load average.
- */
- trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
- for (;;) {
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_interruptible(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
- (d = rcu_seq_done(&rnp->gp_seq, c)));
- if (likely(d))
- break;
- WARN_ON(signal_pending(current));
- trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
- trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
- smp_mb(); /* Ensure that CB invocation happens after GP end. */
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (bypass)
+ del_timer(&my_rdp->nocb_bypass_timer);
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
}
/*
- * Leaders come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
*/
-static void nocb_leader_wait(struct rcu_data *my_rdp)
+static int rcu_nocb_gp_kthread(void *arg)
{
- bool firsttime = true;
- unsigned long flags;
- bool gotcbs;
- struct rcu_data *rdp;
- struct rcu_head **tail;
-
-wait_again:
-
- /* Wait for callbacks to appear. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_wq,
- !READ_ONCE(my_rdp->nocb_leader_sleep));
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = true;
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- } else if (firsttime) {
- firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll"));
- }
-
- /*
- * Each pass through the following loop checks a follower for CBs.
- * We are our own first follower. Any CBs found are moved to
- * nocb_gp_head, where they await a grace period.
- */
- gotcbs = false;
- smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
- if (!rdp->nocb_gp_head)
- continue; /* No CBs here, try next follower. */
-
- /* Move callbacks to wait-for-GP list, which is empty. */
- WRITE_ONCE(rdp->nocb_head, NULL);
- rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- gotcbs = true;
- }
-
- /* No callbacks? Sleep a bit if polling, and go retry. */
- if (unlikely(!gotcbs)) {
- WARN_ON(signal_pending(current));
- if (rcu_nocb_poll) {
- schedule_timeout_interruptible(1);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu,
- TPS("WokeEmpty"));
- }
- goto wait_again;
- }
+ struct rcu_data *rdp = arg;
- /* Wait for one grace period. */
- rcu_nocb_wait_gp(my_rdp);
-
- /* Each pass through the following loop wakes a follower, if needed. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (!rcu_nocb_poll &&
- READ_ONCE(rdp->nocb_head) &&
- READ_ONCE(my_rdp->nocb_leader_sleep)) {
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- }
- if (!rdp->nocb_gp_head)
- continue; /* No CBs, so no need to wake follower. */
-
- /* Append callbacks to follower's "done" list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = rdp->nocb_gp_tail;
- *tail = rdp->nocb_gp_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /* List was empty, so wake up the follower. */
- swake_up_one(&rdp->nocb_wq);
- }
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
-
- /* If we (the leader) don't have CBs, go wait some more. */
- if (!my_rdp->nocb_follower_head)
- goto wait_again;
+ return 0;
}
/*
- * Followers come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
*/
-static void nocb_follower_wait(struct rcu_data *rdp)
+static void nocb_cb_wait(struct rcu_data *rdp)
{
- for (;;) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep"));
- swait_event_interruptible_exclusive(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- if (smp_load_acquire(&rdp->nocb_follower_head)) {
- /* ^^^ Ensure CB invocation follows _head test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_gp = false;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ return;
+ }
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ !READ_ONCE(rdp->nocb_cb_sleep));
+ if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
+ /* ^^^ Ensure CB invocation follows _sleep test. */
+ return;
}
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU, however, there is
- * an optional leader-follower relationship so that the grace-period
- * kthreads don't have to do quite so many wakeups.
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
*/
-static int rcu_nocb_kthread(void *arg)
+static int rcu_nocb_cb_kthread(void *arg)
{
- int c, cl;
- unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
- struct rcu_head **tail;
struct rcu_data *rdp = arg;
- /* Each pass through this loop invokes one batch of callbacks */
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
for (;;) {
- /* Wait for callbacks. */
- if (rdp->nocb_leader == rdp)
- nocb_leader_wait(rdp);
- else
- nocb_follower_wait(rdp);
-
- /* Pull the ready-to-invoke callbacks onto local list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- list = rdp->nocb_follower_head;
- rdp->nocb_follower_head = NULL;
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (WARN_ON_ONCE(!list))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
-
- /* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rcu_state.name,
- atomic_long_read(&rdp->nocb_q_count_lazy),
- rcu_get_n_cbs_nocb_cpu(rdp), -1);
- c = cl = 0;
- while (list) {
- next = list->next;
- /* Wait for enqueuing to complete, if needed. */
- while (next == NULL && &list->next != tail) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WaitQueue"));
- schedule_timeout_interruptible(1);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WokeQueue"));
- next = list->next;
- }
- debug_rcu_head_unqueue(list);
- local_bh_disable();
- if (__rcu_reclaim(rcu_state.name, list))
- cl++;
- c++;
- local_bh_enable();
- cond_resched_tasks_rcu_qs();
- list = next;
- }
- trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1);
- smp_mb__before_atomic(); /* _add after CB invocation. */
- atomic_long_add(-c, &rdp->nocb_q_count);
- atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
return 0;
}
@@ -1993,14 +2135,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
unsigned long flags;
int ndw;
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2027,6 +2169,7 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
@@ -2060,67 +2203,63 @@ void __init rcu_init_nohz(void)
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
- for_each_cpu(cpu, rcu_nocb_mask)
- init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu));
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist);
+ }
rcu_organize_nocb_kthreads();
}
/* Initialize per-rcu_data variables for no-CBs CPUs. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
- rdp->nocb_tail = &rdp->nocb_head;
- init_swait_queue_head(&rdp->nocb_wq);
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it. If the CPUs are brought online out of order,
- * this can require re-organizing the leader-follower relationships.
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
*/
static void rcu_spawn_one_nocb_kthread(int cpu)
{
- struct rcu_data *rdp;
- struct rcu_data *rdp_last;
- struct rcu_data *rdp_old_leader;
- struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
struct task_struct *t;
/*
* If this isn't a no-CBs CPU or if it already has an rcuo kthread,
* then nothing to do.
*/
- if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
return;
- /* If we didn't spawn the leader first, reorganize! */
- rdp_old_leader = rdp_spawn->nocb_leader;
- if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
- rdp_last = NULL;
- rdp = rdp_old_leader;
- do {
- rdp->nocb_leader = rdp_spawn;
- if (rdp_last && rdp != rdp_spawn)
- rdp_last->nocb_next_follower = rdp;
- if (rdp == rdp_spawn) {
- rdp = rdp->nocb_next_follower;
- } else {
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
- }
- } while (rdp);
- rdp_spawn->nocb_next_follower = rdp_old_leader;
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
}
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__))
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
- WRITE_ONCE(rdp_spawn->nocb_kthread, t);
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
/*
@@ -2147,27 +2286,30 @@ static void __init rcu_spawn_nocb_kthreads(void)
rcu_spawn_cpu_nocb_kthread(cpu);
}
-/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_leader_stride = -1;
-module_param(rcu_nocb_leader_stride, int, 0444);
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
/*
- * Initialize leader-follower relationships for all no-CBs CPU.
+ * Initialize GP-CB relationships for all no-CBs CPU.
*/
static void __init rcu_organize_nocb_kthreads(void)
{
int cpu;
- int ls = rcu_nocb_leader_stride;
- int nl = 0; /* Next leader. */
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
- struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
- ls = int_sqrt(nr_cpu_ids);
- rcu_nocb_leader_stride = ls;
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
}
/*
@@ -2178,37 +2320,32 @@ static void __init rcu_organize_nocb_kthreads(void)
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
- /* New leader, set up for followers & next leader. */
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_leader = rdp;
- rdp_leader = rdp;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
} else {
- /* Another follower, link to previous leader. */
- rdp->nocb_leader = rdp_leader;
- rdp_prev->nocb_next_follower = rdp;
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
}
rdp_prev = rdp;
}
-}
-
-/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
-
- /* If there are early-boot callbacks, move them to nocb lists. */
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
- rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
- atomic_long_set(&rdp->nocb_q_count,
- rcu_segcblist_n_cbs(&rdp->cblist));
- atomic_long_set(&rdp->nocb_q_count_lazy,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist));
- rcu_segcblist_init(&rdp->cblist);
- }
- rcu_segcblist_disable(&rdp->cblist);
- return true;
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
}
/*
@@ -2223,20 +2360,101 @@ void rcu_bind_current_to_nocb(void)
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
/*
- * Return the number of RCU callbacks still queued from the specified
- * CPU, which must be a nocbs CPU.
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
*/
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
- return atomic_long_read(&rdp->nocb_q_count);
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "bB"[timer_pending(&rdp->nocb_bypass_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wastimer;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
+ ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
+ ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wastimer = timer_pending(&rdp->nocb_timer);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep &&
+ !waslocked && !wastimer && !wassleep)
+ return; /* Nothing untowards. */
+
+ pr_info(" !!! %c%c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[wastimer],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
{
- WARN_ON_ONCE(1); /* Should be dead code. */
- return false;
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
}
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
@@ -2252,19 +2470,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- return false;
+ return true;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
return false;
}
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2286,14 +2509,8 @@ static void __init rcu_spawn_nocb_kthreads(void)
{
}
-static bool init_nocb_callback_list(struct rcu_data *rdp)
+static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- return false;
-}
-
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
-{
- return 0;
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 065183391f75..55f9b84790d3 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
//
// Printing RCU CPU stall warnings
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_RCU
/*
* Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected;
}
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
* Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
* Dump stacks of all tasks running on stalled CPUs. First try using
@@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+ sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ".l"[rdp->all_lazy],
- ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
- ".D"[!!rdp->tick_nohz_enabled_snap]);
+ !!rdp->tick_nohz_enabled_snap);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -279,6 +277,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+static const char * const gp_state_names[] = {
+ [RCU_GP_IDLE] = "RCU_GP_IDLE",
+ [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
+ [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
+ [RCU_GP_ONOFF] = "RCU_GP_ONOFF",
+ [RCU_GP_INIT] = "RCU_GP_INIT",
+ [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
+ [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
+ [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
+ [RCU_GP_CLEANED] = "RCU_GP_CLEANED",
+};
+
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -527,6 +547,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* We haven't checked in, so go dump stack. */
print_cpu_stall();
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
@@ -534,6 +556,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2);
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
}
}
@@ -585,6 +609,11 @@ void show_rcu_gp_kthreads(void)
cpu, (long)rdp->gp_seq_needed);
}
}
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
+ show_rcu_nocb_state(rdp);
+ }
/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 61df2bf08563..6c4b862f57d6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -40,6 +40,7 @@
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#include <linux/kprobes.h>
+#include <linux/slab.h>
#define CREATE_TRACE_POINTS
@@ -51,9 +52,7 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
-extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
-extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -61,9 +60,15 @@ module_param(rcu_normal_after_boot, int, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
*
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * Returns true if lockdep must be ignored, in which case *ret contains
+ * the best guess described below. Otherwise returns false, in which
+ * case *ret tells the caller nothing and the caller should instead
+ * consult lockdep.
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +80,45 @@ module_param(rcu_normal_after_boot, int, 0);
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes. This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * then rcu_read_lock_held() sets *ret to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
*
- * Similarly, we avoid claiming an SRCU read lock held if the current
+ * Similarly, we avoid claiming an RCU read lock held if the current
* CPU is offline.
*/
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = 1;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = 0;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = 0;
+ return true;
+ }
+ return false;
+}
+
int rcu_read_lock_sched_held(void)
{
- int lockdep_opinion = 0;
+ bool ret;
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
- if (debug_locks)
- lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || !preemptible();
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
@@ -136,8 +151,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
- rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -203,6 +217,7 @@ static int __init rcu_set_runtime_mode(void)
{
rcu_test_sync_prims();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ kfree_rcu_scheduler_running();
rcu_test_sync_prims();
return 0;
}
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
*/
int rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*/
int rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
@@ -410,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#endif
#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_cpu_stall_ftrace_dump __read_mostly;
+module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
@@ -826,14 +853,22 @@ static void test_callback(struct rcu_head *r)
DEFINE_STATIC_SRCU(early_srcu);
+struct early_boot_kfree_rcu {
+ struct rcu_head rh;
+};
+
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
static struct rcu_head shead;
+ struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU))
call_srcu(&early_srcu, &shead, test_callback);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (!WARN_ON_ONCE(!rhp))
+ kfree_rcu(rhp, rh);
}
void rcu_early_boot_tests(void)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7ea4306503c5..76036a41143b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
false, &res)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
@@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head)
EXPORT_SYMBOL(resource_list_free);
#ifdef CONFIG_DEVICE_PRIVATE
-/**
- * devm_request_free_mem_region - find free region for device private memory
- *
- * @dev: device struct to bind the resource to
- * @size: size in bytes of the device memory to add
- * @base: resource tree to look in
- *
- * This function tries to find an empty range of physical address big enough to
- * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
- * memory, which in turn allocates struct pages.
- */
-struct resource *devm_request_free_mem_region(struct device *dev,
- struct resource *base, unsigned long size)
+static struct resource *__request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size, const char *name)
{
resource_size_t end, addr;
struct resource *res;
@@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev,
REGION_DISJOINT)
continue;
- res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+ if (dev)
+ res = devm_request_mem_region(dev, addr, size, name);
+ else
+ res = request_mem_region(addr, size, name);
if (!res)
return ERR_PTR(-ENOMEM);
res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
@@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev,
return ERR_PTR(-ERANGE);
}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ return __request_free_mem_region(dev, base, size, dev_name(dev));
+}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name)
+{
+ return __request_free_mem_region(NULL, base, size, name);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
#endif /* CONFIG_DEVICE_PRIVATE */
static int __init strict_iomem(char *str)
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 27c48eb7de40..a4f86a9d6937 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int ret;
if (flags & RSEQ_FLAG_UNREGISTER) {
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1152259a4ca0..12bca64dff73 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
@@ -393,7 +393,7 @@ void sched_clock_tick(void)
if (sched_clock_stable())
return;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@ void __init sched_clock_init(void)
u64 sched_clock_cpu(int cpu)
{
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return 0;
return sched_clock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..1a9983da4408 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <asm/tlb.h>
#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
@@ -255,7 +256,7 @@ static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -314,7 +315,7 @@ void hrtick_start(struct rq *rq, u64 delay)
*/
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
@@ -328,7 +329,7 @@ static void hrtick_rq_init(struct rq *rq)
rq->hrtick_csd.info = rq;
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
@@ -551,27 +552,32 @@ void resched_cpu(int cpu)
*/
int get_nohz_timer_target(void)
{
- int i, cpu = smp_processor_id();
+ int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i)
continue;
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i)) {
cpu = i;
goto unlock;
}
}
}
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (default_cpu == -1)
+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ cpu = default_cpu;
unlock:
rcu_read_unlock();
return cpu;
@@ -773,6 +779,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
@@ -798,7 +816,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline unsigned int uclamp_none(int clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -814,7 +832,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
}
static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/*
@@ -830,7 +848,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
return uclamp_none(UCLAMP_MIN);
}
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/* Reset max-clamp retention only on idle exit */
@@ -841,8 +859,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
}
static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- unsigned int clamp_value)
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +879,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ struct uclamp_se uc_max;
+
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ return uc_max;
+#endif
+
+ return uc_req;
+}
+
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
- struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
@@ -880,17 +924,17 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
- return p->uclamp[clamp_id].value;
+ return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
- return uc_eff.value;
+ return (unsigned long)uc_eff.value;
}
/*
@@ -904,7 +948,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +986,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1025,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -996,7 +1040,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1005,15 +1049,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
+ */
+ if (p->uclamp[clamp_id].active) {
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ unsigned int clamps)
+{
+ enum uclamp_id clamp_id;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ for_each_clamp_id(clamp_id) {
+ if ((0x1 << clamp_id) & clamps)
+ uclamp_update_active(p, clamp_id);
+ }
+ }
+ css_task_iter_end(&it);
+}
+
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ rcu_read_lock();
+ cpu_util_update_eff(&root_task_group.css);
+ rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ bool update_root_tg = false;
int old_min, old_max;
- static DEFINE_MUTEX(mutex);
int result;
- mutex_lock(&mutex);
+ mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
@@ -1032,23 +1143,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
}
+ if (update_root_tg)
+ uclamp_update_root_tg();
+
/*
- * Updating all the RUNNABLE task is expensive, keep it simple and do
- * just a lazy update at each next enqueue time.
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
*/
+
goto done;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
done:
- mutex_unlock(&mutex);
+ mutex_unlock(&uclamp_mutex);
return result;
}
@@ -1075,7 +1193,7 @@ static int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
/*
* On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1230,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
static void uclamp_fork(struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1134,11 +1252,14 @@ static void uclamp_fork(struct task_struct *p)
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
int cpu;
+ mutex_init(&uclamp_mutex);
+
for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ memset(&cpu_rq(cpu)->uclamp, 0,
+ sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}
@@ -1149,8 +1270,13 @@ static void __init init_uclamp(void)
/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- for_each_clamp_id(clamp_id)
+ for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+ }
}
#else /* CONFIG_UCLAMP_TASK */
@@ -1321,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
-static inline bool is_per_cpu_kthread(struct task_struct *p)
-{
- if (!(p->flags & PF_KTHREAD))
- return false;
-
- if (p->nr_cpus_allowed != 1)
- return false;
-
- return true;
-}
-
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1494,7 +1609,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
@@ -1537,7 +1652,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
@@ -1558,7 +1674,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -2986,7 +3101,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = next;
@@ -3135,7 +3250,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
/* Task is done with its stack. */
put_task_stack(prev);
- put_task_struct(prev);
+ put_task_struct_rcu_user(prev);
}
tick_nohz_task_switch();
@@ -3214,12 +3329,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -3228,22 +3339,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
arch_start_context_switch(prev);
/*
- * If mm is non-NULL, we pass through switch_mm(). If mm is
- * NULL, we will pass through mmdrop() in finish_task_switch().
- * Both of these contain the full memory barrier required by
- * membarrier after storing to rq->curr, before returning to
- * user-space.
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
*/
- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3486,8 +3612,36 @@ void scheduler_tick(void)
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
@@ -3500,6 +3654,7 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3508,38 +3663,47 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (cpu_is_offline(cpu))
goto out_unlock;
+ curr = rq->curr;
update_rq_clock(rq);
- delta = rq_clock_task(rq) - curr->se.exec_start;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
curr->sched_class->task_tick(rq, curr, 0);
+ calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
-
out_requeue:
+
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
@@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
@@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
@@ -3700,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
@@ -3737,25 +3917,41 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, rf);
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, rf);
+ if (!p) {
+ put_prev_task(rq, prev);
+ p = pick_next_task_idle(rq);
+ }
return p;
}
-again:
+restart:
+#ifdef CONFIG_SMP
+ /*
+ * We must do the balancing pass before put_next_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
@@ -3782,7 +3978,7 @@ again:
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3791,7 +3987,7 @@ again:
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
@@ -3813,7 +4009,7 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -3857,7 +4053,11 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
- rq->curr = next;
+ /*
+ * RCU users of rcu_dereference(rq->curr) may not see
+ * changes to task_struct made by pick_next_task().
+ */
+ RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -3904,7 +4104,7 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
@@ -3914,12 +4114,18 @@ static inline void sched_submit_work(struct task_struct *tsk)
* we disable preemption to avoid it calling schedule() again
* in the possible wakeup of a kworker.
*/
- if (tsk->flags & PF_WQ_WORKER) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
- wq_worker_sleeping(tsk);
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else
+ io_wq_worker_sleeping(tsk);
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -3930,8 +4136,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & PF_WQ_WORKER)
- wq_worker_running(tsk);
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+ else
+ io_wq_worker_running(tsk);
+ }
}
asmlinkage __visible void __sched schedule(void)
@@ -4033,11 +4243,10 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
*/
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
@@ -4105,10 +4314,10 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
- * this is the entry point to schedule() from kernel preemption
+ * This is the entry point to schedule() from kernel preemption
* off of irq context.
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
@@ -4273,7 +4482,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -4294,7 +4503,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio, delta;
+ int old_prio;
struct rq_flags rf;
struct rq *rq;
@@ -4328,19 +4537,18 @@ void set_user_nice(struct task_struct *p, long nice)
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
- delta = p->prio - old_prio;
- if (queued) {
+ if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4657,6 +4865,9 @@ recheck:
return retval;
}
+ if (pi)
+ cpuset_read_lock();
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4671,8 +4882,8 @@ recheck:
* Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
+ retval = -EINVAL;
+ goto unlock;
}
/*
@@ -4690,8 +4901,8 @@ recheck:
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
+ retval = 0;
+ goto unlock;
}
change:
@@ -4704,8 +4915,8 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
#endif
#ifdef CONFIG_SMP
@@ -4720,8 +4931,8 @@ change:
*/
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
}
#endif
@@ -4731,6 +4942,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
goto recheck;
}
@@ -4740,8 +4953,8 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
+ retval = -EBUSY;
+ goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
@@ -4783,7 +4996,7 @@ change:
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
@@ -4791,14 +5004,22 @@ change:
preempt_disable();
task_rq_unlock(rq, p, &rf);
- if (pi)
+ if (pi) {
+ cpuset_read_unlock();
rt_mutex_adjust_pi(p);
+ }
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4882,10 +5103,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -4897,9 +5123,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
u32 size;
int ret;
- if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
- return -EFAULT;
-
/* Zero the full structure, so that a short copy will be nice: */
memset(attr, 0, sizeof(*attr));
@@ -4907,45 +5130,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return ret;
- /* Bail out on silly large: */
- if (size > PAGE_SIZE)
- goto err_size;
-
/* ABI compatibility quirk: */
if (!size)
size = SCHED_ATTR_SIZE_VER0;
-
- if (size < SCHED_ATTR_SIZE_VER0)
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
@@ -5102,37 +5299,40 @@ out_unlock:
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
+ kattr->size = min(usize, ksize);
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
-
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -5142,20 +5342,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) for fwd/bwd comp.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -5168,25 +5366,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
@@ -5416,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -5433,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
@@ -5830,10 +6027,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ __sched_fork(0, idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
@@ -5863,7 +6061,8 @@ void init_idle(struct task_struct *idle, int cpu)
__set_task_cpu(idle, cpu);
rcu_read_unlock();
- rq->curr = rq->idle = idle;
+ rq->idle = idle;
+ rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
@@ -5972,7 +6171,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -6012,21 +6211,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
{
-}
+ const struct sched_class *class;
+ struct task_struct *next;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ for_each_class(class) {
+ next = class->pick_next_task(rq);
+ if (next) {
+ next->sched_class->put_prev_task(rq, next);
+ return next;
+ }
+ }
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ /* The idle class should always have a runnable task */
+ BUG();
+}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6069,12 +6269,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
if (rq->nr_running == 1)
break;
- /*
- * pick_next_task() assumes pinned rq->lock:
- */
- next = pick_next_task(rq, &fake_task, rf);
- BUG_ON(!next);
- put_prev_task(rq, next);
+ next = __pick_migrate_task(rq);
/*
* Rules for changing task_struct::cpus_mask are holding
@@ -6228,8 +6423,6 @@ int sched_cpu_activate(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);
- update_max_interval();
-
return 0;
}
@@ -6371,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
int i;
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
@@ -6570,7 +6763,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check();
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
+ !is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
@@ -6586,8 +6779,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
if (task_stack_end_corrupted(current))
@@ -6702,7 +6895,7 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
@@ -6727,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -6750,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
@@ -6852,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
- if (running)
- set_curr_task(rq, tsk);
+ if (running) {
+ set_next_task(rq, tsk);
+ /*
+ * After changing group, the running task may have joined a
+ * throttled one but it's still the running task. Trigger a
+ * resched to make sure that task can still run.
+ */
+ resched_curr(rq);
+ }
task_rq_unlock(rq, tsk, &rf);
}
@@ -6889,6 +7105,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent)
sched_online_group(tg, parent);
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ cpu_util_update_eff(css);
+#endif
+
return 0;
}
@@ -6936,10 +7158,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
@@ -6970,6 +7188,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
+
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
+
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css, clamps);
+ }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
+
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
+
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
+
+ return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_request req;
+ struct task_group *tg;
+
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
+
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
+
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
+
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+
+ return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
+{
+ struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
+
+ rcu_read_lock();
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ rcu_read_unlock();
+
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
+
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MIN);
+ return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -7315,6 +7705,20 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* Terminate */
};
@@ -7482,6 +7886,20 @@ static struct cftype cpu_files[] = {
.write = cpu_max_write,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* terminate */
};
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d83c7f..7c2fe50fd76d 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 867b4bb6d4be..7fbaee24c824 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
@@ -117,6 +115,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
struct cpufreq_policy *policy = sg_policy->policy;
+ int cpu;
if (!sugov_update_next_freq(sg_policy, time, next_freq))
return;
@@ -126,7 +125,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
return;
policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
+
+ if (trace_cpu_frequency_enabled()) {
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_frequency(next_freq, cpu);
+ }
}
static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
@@ -235,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
- util = uclamp_util_with(rq, util, p);
+ util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -263,9 +266,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
- * 1 - irq
- * U' = irq + ------- * U
- * max
+ * max - irq
+ * U' = irq + --------- * U
+ * max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
@@ -910,7 +913,7 @@ static int __init sugov_register(void)
{
return cpufreq_register_governor(&schedutil_gov);
}
-fs_initcall(sugov_register);
+core_initcall(sugov_register);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b7abca987d94..1a2719e1350a 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -46,6 +46,8 @@ static int convert_prio(int prio)
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks whether the CPU
+ * fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -57,7 +59,8 @@ static int convert_prio(int prio)
* Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
int idx = 0;
int task_pri = convert_prio(p->prio);
@@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
continue;
if (lowest_mask) {
+ int cpu;
+
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
@@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* condition, simply act as though we never hit this
* priority level and continue on.
*/
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ if (cpumask_empty(lowest_mask))
+ continue;
+
+ if (!fitness_fn)
+ return 1;
+
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+ }
+
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
continue;
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 7dc20a3232e7..32dd520db11f 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -18,7 +18,9 @@ struct cpupri {
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce89a26c..cff3e656566d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;
@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
- } else if (p == rq->idle) {
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,40 +392,36 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) { }
+ int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
- vtime_account_system(prev);
+ vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@@ -436,7 +432,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
- vtime_account_system(tsk);
+ vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -475,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
- struct rq *rq = this_rq();
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -495,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (user_tick)
account_user_time(p, cputime);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
@@ -711,8 +706,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk,
- struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +726,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@@ -739,11 +744,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
- /* We might have scheduled out from guest path */
- if (current->flags & PF_VCPU)
- vtime_account_guest(tsk, vtime);
- else
- __vtime_account_system(tsk, vtime);
+ __vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@@ -752,7 +753,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@@ -782,8 +783,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
- current->flags |= PF_VCPU;
+ vtime_account_system(tsk, vtime);
+ tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -794,7 +796,8 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
- current->flags &= ~PF_VCPU;
+ tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +807,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +841,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +861,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +892,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed though vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (vtime->state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But proper fix would involve interrupting target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (vtime->state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ struct rq *rq;
+ u64 val;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return cpustat[usage];
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (vtime->state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (vtime->state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 46122edd8552..43323f875cb9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p)
dl_se->dl_non_contending = 1;
get_task_struct(p);
- hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
+ struct dl_bw *dl_b;
later_rq = find_lock_later_rq(p, rq);
if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
+
+ /*
+ * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
set_task_cpu(p, later_rq->cpu);
double_unlock_balance(later_rq, rq);
@@ -923,7 +956,7 @@ static int start_dl_timer(struct task_struct *p)
*/
if (!hrtimer_is_queued(timer)) {
get_task_struct(p);
- hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
@@ -1053,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = dl_task_timer;
}
@@ -1292,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = inactive_task_timer;
}
@@ -1658,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
+static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+ if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+
+ return sched_stop_runnable(rq) || sched_dl_runnable(rq);
+}
#endif /* CONFIG_SMP */
/*
@@ -1694,12 +1743,23 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+ if (!first)
+ return;
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ deadline_queue_push_tasks(rq);
}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1713,61 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
+ struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
- struct dl_rq *dl_rq;
-
- dl_rq = &rq->dl;
- if (need_pull_dl_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_dl_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_dl_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
-
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
-
- if (unlikely(!dl_rq->dl_nr_running))
+ if (!sched_dl_runnable(rq))
return NULL;
- put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
-
p = dl_task_of(dl_se);
-
- set_next_task(rq, p);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- deadline_queue_push_tasks(rq);
-
- if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
-
+ set_next_task_dl(rq, p, true);
return p;
}
@@ -1811,11 +1829,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void set_curr_task_dl(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
#ifdef CONFIG_SMP
/* Only try algorithms three times */
@@ -2275,6 +2288,36 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+void dl_add_task_root_domain(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ struct dl_bw *dl_b;
+
+ rq = task_rq_lock(p, &rf);
+ if (!dl_task(p))
+ goto unlock;
+
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+ raw_spin_unlock(&dl_b->lock);
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ rd->dl_bw.total_bw = 0;
+ raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2395,8 +2438,10 @@ const struct sched_class dl_sched_class = {
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
+ .balance = balance_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
@@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
#endif
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..879d3ccf3806 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void)
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc9cfeaac8bd..3c8a379c357e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
}
/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
-static unsigned int capacity_margin = 1280;
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
}
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
@@ -749,7 +748,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);
/*
@@ -803,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -1188,47 +1186,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
- int mm_users = 0;
- struct mm_struct *mm = p->mm;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1) {
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- mm->numa_scan_seq = 0;
- }
- }
- p->node_stamp = 0;
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- RCU_INIT_POINTER(p->numa_group, NULL);
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- /* New address space, reset the preferred nid */
- if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = NUMA_NO_NODE;
- return;
- }
-
- /*
- * New thread, keep existing numa_preferred_nid which should be copied
- * already by arch_dup_task_struct but stagger when scans start.
- */
- if (mm) {
- unsigned int delay;
-
- delay = min_t(unsigned int, task_scan_max(current),
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
- delay += 2 * TICK_NSEC;
- p->node_stamp = delay;
- }
-}
-
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -1516,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long cpu_runnable_load(struct rq *rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1644,7 +1606,7 @@ static void task_numa_compare(struct task_numa_env *env,
return;
rcu_read_lock();
- cur = task_rcu_dereference(&dst_rq->curr);
+ cur = rcu_dereference(dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
@@ -2523,7 +2485,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
@@ -2536,7 +2498,7 @@ void task_numa_work(struct callback_head *work)
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
+ work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
@@ -2665,6 +2627,50 @@ out:
}
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ /* Protect against double add, see task_tick_numa and task_numa_work */
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ init_task_work(&p->numa_work, task_numa_work);
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -2693,10 +2699,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, true);
- }
}
}
@@ -3110,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3362,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
+ delta_sum = runnable_load_sum -
+ se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3504,9 +3509,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3514,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
- * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3555,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3613,11 +3614,15 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3689,8 +3694,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -3708,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3766,10 +3783,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ if (sched_feat(UTIL_EST_FASTUP)) {
+ if (ue.ewma < ue.enqueued) {
+ ue.ewma = ue.enqueued;
+ goto done;
+ }
+ }
+
+ /*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
@@ -3802,12 +3830,13 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
WRITE_ONCE(p->se.avg.util_est, ue);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return capacity * 1024 > task_util_est(p) * capacity_margin;
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3842,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -4355,23 +4384,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)
}
/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
- if (cfs_b->quota == RUNTIME_INF)
- return;
-
- now = sched_clock_cpu(smp_processor_id());
- cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ if (cfs_b->quota != RUNTIME_INF)
+ cfs_b->runtime = cfs_b->quota;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4379,22 +4401,12 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
- if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
-
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
-}
-
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4411,65 +4423,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4522,7 +4492,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- /* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
@@ -4554,7 +4523,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4565,6 +4534,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4574,6 +4544,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4613,7 +4584,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4633,6 +4604,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
enqueue = 0;
@@ -4641,6 +4613,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4656,8 +4629,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4673,13 +4645,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4704,7 +4678,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4732,8 +4706,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4746,8 +4718,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4829,8 +4800,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4863,7 +4833,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4881,7 +4850,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4890,11 +4858,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4990,20 +4957,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
@@ -5042,17 +5017,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- u64 overrun;
-
lockdep_assert_held(&cfs_b->lock);
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
- overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5130,11 +5101,6 @@ static inline bool cfs_bandwidth_used(void)
return false;
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
- return rq_clock_task(rq_of(cfs_rq));
-}
-
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -5230,7 +5196,7 @@ static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5244,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5254,6 +5234,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5286,6 +5267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
flags = ENQUEUE_WAKEUP;
}
@@ -5293,6 +5275,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5354,6 +5337,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5368,6 +5353,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5387,6 +5373,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5398,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5420,26 +5411,45 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-static unsigned long cpu_runnable_load(struct rq *rq)
+static unsigned long cpu_load(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&rq->cfs);
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cpu_rq(cpu)->cpu_capacity;
-}
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = cpu_runnable_load(rq);
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
- if (nr_running)
- return load_avg / nr_running;
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
- return 0;
+ /* Discount task's util from CPU's util */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
@@ -5532,7 +5542,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5550,7 +5560,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5588,149 +5598,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
-{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX;
- unsigned long this_runnable_load = ULONG_MAX;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
- unsigned long most_spare = 0, this_spare = 0;
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
- do {
- unsigned long load, avg_load, runnable_load;
- unsigned long spare_cap, max_spare_cap;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_span(group),
- p->cpus_ptr))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
- /*
- * Tally up the load of all CPUs in the group and find
- * the group containing the CPU with most spare capacity.
- */
- avg_load = 0;
- runnable_load = 0;
- max_spare_cap = 0;
-
- for_each_cpu(i, sched_group_span(group)) {
- load = cpu_runnable_load(cpu_rq(i));
- runnable_load += load;
-
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
- spare_cap = capacity_spare_without(i, p);
-
- if (spare_cap > max_spare_cap)
- max_spare_cap = spare_cap;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
-
- if (local_group) {
- this_runnable_load = runnable_load;
- this_avg_load = avg_load;
- this_spare = max_spare_cap;
- } else {
- if (min_runnable_load > (runnable_load + imbalance)) {
- /*
- * The runnable load is significantly smaller
- * so we can pick this new CPU:
- */
- min_runnable_load = runnable_load;
- min_avg_load = avg_load;
- idlest = group;
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
- (100*min_avg_load > imbalance_scale*avg_load)) {
- /*
- * The runnable loads are close so take the
- * blocked load into account through avg_load:
- */
- min_avg_load = avg_load;
- idlest = group;
- }
-
- if (most_spare < max_spare_cap) {
- most_spare = max_spare_cap;
- most_spare_sg = group;
- }
- }
- } while (group = group->next, group != sd->groups);
-
- /*
- * The cross-over point between using spare capacity or least load
- * is too conservative for high utilization tasks on partially
- * utilized systems if we require spare_capacity > task_util(p),
- * so we allow for some task stuffing by using
- * spare_capacity > task_util(p)/2.
- *
- * Spare capacity can't be used for fork because the utilization has
- * not been set yet, we must first select a rq to compute the initial
- * utilization.
- */
- if (sd_flag & SD_BALANCE_FORK)
- goto skip_spare;
-
- if (this_spare > task_util(p) / 2 &&
- imbalance_scale*this_spare > 100*most_spare)
- return NULL;
-
- if (most_spare > task_util(p) / 2)
- return most_spare_sg;
-
-skip_spare:
- if (!idlest)
- return NULL;
-
- /*
- * When comparing groups across NUMA domains, it's possible for the
- * local domain to be very lightly loaded relative to the remote
- * domains but "imbalance" skews the comparison making remote CPUs
- * look much more favourable. When considering cross-domain, add
- * imbalance to the runnable load on the remote node and consider
- * staying local.
- */
- if ((sd->flags & SD_NUMA) &&
- min_runnable_load + imbalance >= this_runnable_load)
- return NULL;
-
- if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
-
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
- (100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
-
- return idlest;
-}
+ int this_cpu, int sd_flag);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5751,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5774,7 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = cpu_runnable_load(cpu_rq(i));
+ load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5794,7 +5667,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_without, sync it up to
+ * We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
@@ -5943,7 +5816,7 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
}
@@ -5971,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, nr = INT_MAX;
int this = smp_processor_id();
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6002,12 +5876,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
}
@@ -6027,21 +5901,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (available_idle_cpu(target))
+ if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) &&
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+ return prev;
+
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1) {
return prev;
+ }
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- available_idle_cpu(recent_used_cpu) &&
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
@@ -6277,69 +6166,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
- unsigned long sum_util, energy = 0;
- struct task_struct *tsk;
+ struct cpumask *pd_mask = perf_domain_span(pd);
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+ unsigned long max_util = 0, sum_util = 0;
int cpu;
- for (; pd; pd = pd->next) {
- struct cpumask *pd_mask = perf_domain_span(pd);
+ /*
+ * The capacity state of CPUs of the current rd can be driven by CPUs
+ * of another rd if they belong to the same pd. So, account for the
+ * utilization of these CPUs too by masking pd with cpu_online_mask
+ * instead of the rd span.
+ *
+ * If an entire pd is outside of the current rd, it will not appear in
+ * its pd list and will not be accounted by compute_energy().
+ */
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
/*
- * The energy model mandates all the CPUs of a performance
- * domain have the same capacity.
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
*/
- cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- max_util = sum_util = 0;
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
/*
- * The capacity state of CPUs of the current rd can be driven by
- * CPUs of another rd if they belong to the same performance
- * domain. So, account for the utilization of these CPUs too
- * by masking pd with cpu_online_mask instead of the rd span.
- *
- * If an entire performance domain is outside of the current rd,
- * it will not appear in its pd list and will not be accounted
- * by compute_energy().
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
*/
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
-
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
- */
- tsk = cpu == dst_cpu ? p : NULL;
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
- }
-
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return energy;
+ return em_pd_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -6381,21 +6256,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
-
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ unsigned long cpu_cap, util, base_energy = 0;
int cpu, best_energy_cpu = prev_cpu;
- struct perf_domain *head, *pd;
- unsigned long cpu_cap, util;
struct sched_domain *sd;
+ struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto fail;
- head = pd;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6412,31 +6285,44 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
goto unlock;
for (; pd; pd = pd->next) {
- unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- if (cpu_cap * 1024 < util * capacity_margin)
+ spare_cap = cpu_cap - util;
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
+ if (!fits_capacity(util, cpu_cap))
continue;
/* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_energy = compute_energy(p, prev_cpu, head);
- best_energy = min(best_energy, prev_energy);
- continue;
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
}
/*
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -6444,10 +6330,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
/* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0) {
- cur_energy = compute_energy(p, max_spare_cap_cpu, head);
- if (cur_energy < best_energy) {
- best_energy = cur_energy;
+ if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ cur_delta -= base_energy_pd;
+ if (cur_delta < best_delta) {
+ best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
@@ -6459,10 +6346,10 @@ unlock:
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_energy == ULONG_MAX)
+ if (prev_delta == ULONG_MAX)
return best_energy_cpu;
- if ((prev_energy - best_energy) > (prev_energy >> 4))
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
return best_energy_cpu;
return prev_cpu;
@@ -6616,6 +6503,15 @@ static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
+
+static int
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ if (rq->nr_running)
+ return 1;
+
+ return newidle_balance(rq, rf) != 0;
+}
#endif /* CONFIG_SMP */
static unsigned long wakeup_gran(struct sched_entity *se)
@@ -6783,7 +6679,7 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *
+struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6792,11 +6688,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- if (!cfs_rq->nr_running)
+ if (!sched_fair_runnable(rq))
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
@@ -6873,8 +6769,8 @@ again:
goto done;
simple:
#endif
-
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
do {
se = pick_next_entity(cfs_rq, NULL);
@@ -6902,11 +6798,13 @@ done: __maybe_unused;
return p;
idle:
- update_misfit_status(NULL, rq);
- new_tasks = idle_balance(rq, rf);
+ if (!rf)
+ return NULL;
+
+ new_tasks = newidle_balance(rq, rf);
/*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -6925,6 +6823,11 @@ idle:
return NULL;
}
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+ return pick_next_task_fair(rq, NULL, NULL);
+}
+
/*
* Account for a descheduled task:
*/
@@ -7114,11 +7017,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
enum group_type {
- group_other = 0,
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+ * and must be migrated to a more powerful CPU.
+ */
group_misfit_task,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
group_imbalanced,
- group_overloaded,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
};
#define LBF_ALL_PINNED 0x01
@@ -7151,7 +7092,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
- enum group_type src_grp_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -7374,7 +7315,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7382,8 +7323,8 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
@@ -7416,21 +7357,55 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ switch (env->migration_type) {
+ case migrate_load:
+ load = task_h_load(p);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let relax the constraint if
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (load/2 > env->imbalance &&
+ env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+ goto next;
+
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (util > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
@@ -7442,7 +7417,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * runnable load.
+ * load/util/tasks.
*/
if (env->imbalance <= 0)
break;
@@ -7552,6 +7527,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7571,16 +7568,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7589,9 +7581,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7606,19 +7602,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
-
- update_blocked_load_status(rq, !done);
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7668,23 +7655,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
+ bool decayed;
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
- update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7693,6 +7673,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -7701,14 +7699,14 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
- unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_no_capacity;
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -7723,10 +7721,10 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
- unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
@@ -7737,19 +7735,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
- * We must however clear busiest_stat::avg_load because
- * update_sd_pick_busiest() reads this before assignment.
+ * We must however set busiest_stat::group_type and
+ * busiest_stat::idle_cpus to the worst busiest group because
+ * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
- .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
- .avg_load = 0UL,
- .sum_nr_running = 0,
- .group_type = group_other,
+ .idle_cpus = UINT_MAX,
+ .group_type = group_has_spare,
},
};
}
@@ -7820,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
-
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
+ unsigned long cpu_cap = capacity_of(cpu);
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -7937,13 +7916,13 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
if ((sgs->group_capacity * 100) >
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7958,13 +7937,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
* false.
*/
static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7977,8 +7956,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->min_capacity * capacity_margin <
- ref->sgc->min_capacity * 1024;
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
/*
@@ -7988,24 +7966,30 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->max_capacity * capacity_margin <
- ref->sgc->max_capacity * 1024;
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
}
static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(unsigned int imbalance_pct,
+ struct sched_group *group,
struct sg_lb_stats *sgs)
{
- if (sgs->group_no_capacity)
+ if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_asym_packing)
+ return group_asym_packing;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
- return group_other;
+ if (!group_has_capacity(imbalance_pct, sgs))
+ return group_fully_busy;
+
+ return group_has_spare;
}
static bool update_nohz_stats(struct rq *rq, bool force)
@@ -8042,21 +8026,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running;
+ int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- sgs->group_load += cpu_runnable_load(rq);
+ sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
+ sgs->sum_nr_running += nr_running;
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -8070,9 +8058,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/*
* No need to call idle_cpu() if nr_running is not 0
*/
- if (!nr_running && idle_cpu(i))
+ if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
+ /* Idle cpu can't have misfit task */
+ continue;
+ }
+
+ if (local_group)
+ continue;
+ /* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8080,17 +8075,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE &&
+ sgs->sum_h_nr_running &&
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+ sgs->group_asym_packing = 1;
+ }
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
+ sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+ /* Computing avg_load makes sense only when group is overloaded */
+ if (sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
}
/**
@@ -8113,6 +8115,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /* Make sure that there is at least one task to pull */
+ if (!sgs->sum_h_nr_running)
+ return false;
+
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
@@ -8121,7 +8127,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if (sgs->group_type == group_misfit_task &&
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
- !group_has_capacity(env, &sds->local_stat)))
+ sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
@@ -8130,62 +8136,92 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
-
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
-
/*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
+ * The candidate and the current busiest group are the same type of
+ * group. Let check which one is the busiest according to the type.
*/
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_min_cpu_capacity(sds->local, sg))
- return false;
- /*
- * If we have more than one misfit sg go with the biggest misfit.
- */
- if (sgs->group_type == group_misfit_task &&
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ switch (sgs->group_type) {
+ case group_overloaded:
+ /* Select the overloaded group with highest avg_load. */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ /*
+ * Select the 1st imbalanced group as we don't have any way to
+ * choose one more than another.
+ */
return false;
-asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
+ case group_asym_packing:
+ /* Prefer to move from lowest priority CPU's work */
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+ return false;
+ break;
- /* No ASYM_PACKING if target CPU is already busy */
- if (env->idle == CPU_NOT_IDLE)
- return true;
- /*
- * ASYM_PACKING needs to move all the work to the highest
- * prority CPUs in the group, therefore mark all groups
- * of lower priority than ourself as busy.
- */
- if (sgs->sum_nr_running &&
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
- if (!sds->busiest)
- return true;
+ case group_misfit_task:
+ /*
+ * If we have more than one misfit sg go with the biggest
+ * misfit.
+ */
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ return false;
+ break;
- /* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
- sg->asym_prefer_cpu))
- return true;
+ case group_fully_busy:
+ /*
+ * Select the fully busy group with highest avg_load. In
+ * theory, there is no need to pull task from such kind of
+ * group because tasks have all compute capacity that they need
+ * but we can still improve the overall throughput by reducing
+ * contention when accessing shared HW resources.
+ *
+ * XXX for now avg_load is not computed and always 0 so we
+ * select the 1st one.
+ */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_has_spare:
+ /*
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
+ * CPUs which means less opportunity to pull tasks.
+ */
+ if (sgs->idle_cpus > busiest->idle_cpus)
+ return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
+ return false;
+
+ break;
}
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and has higher
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+ * throughput. Maximize throughput, power/energy consequences are not
+ * considered.
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type <= group_fully_busy) &&
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
+ return false;
+
+ return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
@@ -8210,18 +8246,314 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return 0;
+
+ if (task_on_rq_queued(p))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle && rq->curr != p)
+ return 0;
+
+ /*
+ * rq->nr_running can't be used but an updated version without the
+ * impact of p on cpu must be used instead. The updated nr_running
+ * be computed and tested before calling idle_cpu_without().
+ */
+
+#ifdef CONFIG_SMP
+ if (!llist_empty(&rq->wake_list))
+ return 0;
+#endif
+
+ return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ struct task_struct *p)
+{
+ int i, nr_running;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ for_each_cpu(i, sched_group_span(group)) {
+ struct rq *rq = cpu_rq(i);
+ unsigned int local;
+
+ sgs->group_load += cpu_load_without(rq, p);
+ sgs->group_util += cpu_util_without(i, p);
+ local = task_running_on_cpu(i, p);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+
+ nr_running = rq->nr_running - local;
+ sgs->sum_nr_running += nr_running;
+
+ /*
+ * No need to call idle_cpu_without() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu_without(i, p))
+ sgs->idle_cpus++;
+
+ }
+
+ /* Check if task fits in the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
+ sgs->group_misfit_task_load = 1;
+ }
+
+ sgs->group_capacity = group->sgc->capacity;
+
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+ /*
+ * Computing avg_load makes sense only when group is fully busy or
+ * overloaded
+ */
+ if (sgs->group_type < group_fully_busy)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+ struct sg_lb_stats *idlest_sgs,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_type < idlest_sgs->group_type)
+ return true;
+
+ if (sgs->group_type > idlest_sgs->group_type)
+ return false;
+
+ /*
+ * The candidate and the current idlest group are the same type of
+ * group. Let check which one is the idlest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /* Select the group with lowest avg_load. */
+ if (idlest_sgs->avg_load <= sgs->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those types are not used in the slow wakeup path */
+ return false;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+ return false;
+ break;
+
+ case group_has_spare:
+ /* Select group with most idle CPUs */
+ if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int sd_flag)
+{
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+ struct sg_lb_stats local_sgs, tmp_sgs;
+ struct sg_lb_stats *sgs;
+ unsigned long imbalance;
+ struct sg_lb_stats idlest_sgs = {
+ .avg_load = UINT_MAX,
+ .group_type = group_overloaded,
+ };
+
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
+ do {
+ int local_group;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_span(group),
+ p->cpus_ptr))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_span(group));
+
+ if (local_group) {
+ sgs = &local_sgs;
+ local = group;
+ } else {
+ sgs = &tmp_sgs;
+ }
+
+ update_sg_wakeup_stats(sd, group, sgs, p);
+
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+ idlest = group;
+ idlest_sgs = *sgs;
+ }
+
+ } while (group = group->next, group != sd->groups);
+
+
+ /* There is no idlest group to push tasks to */
+ if (!idlest)
+ return NULL;
+
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
+ /*
+ * If the local group is idler than the selected idlest group
+ * don't try and push the task.
+ */
+ if (local_sgs.group_type < idlest_sgs.group_type)
+ return NULL;
+
+ /*
+ * If the local group is busier than the selected idlest group
+ * try and push the task.
+ */
+ if (local_sgs.group_type > idlest_sgs.group_type)
+ return idlest;
+
+ switch (local_sgs.group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /*
+ * When comparing groups across NUMA domains, it's possible for
+ * the local domain to be very lightly loaded relative to the
+ * remote domains but "imbalance" skews the comparison making
+ * remote CPUs look much more favourable. When considering
+ * cross-domain, add imbalance to the load on the remote node
+ * and consider staying local.
+ */
+
+ if ((sd->flags & SD_NUMA) &&
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+ return NULL;
+
+ /*
+ * If the local group is less loaded than the selected
+ * idlest group don't try and push any tasks.
+ */
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+ return NULL;
+
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+ return NULL;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those type are not used in the slow wakeup path */
+ return NULL;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+ return NULL;
+ break;
+
+ case group_has_spare:
+ if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+ int idlest_cpu;
+ /*
+ * If there is spare capacity at NUMA, try to select
+ * the preferred node
+ */
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+ return NULL;
+
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+ return idlest;
+#endif
+ /*
+ * Otherwise, keep the task on this node to stay close
+ * its wakeup source and improve locality. If there is
+ * a real need of migration, periodic load balance will
+ * take care of it.
+ */
+ if (local_sgs.idle_cpus)
+ return NULL;
+ }
+
+ /*
+ * Select group with highest number of idle CPUs. We could also
+ * compare the utilization which is more stable but it can end
+ * up that the group has less spare capacity but finally more
+ * idle CPUs which means more opportunity to run task.
+ */
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+ return NULL;
+ break;
+ }
+
+ return idlest;
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
+
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
@@ -8248,22 +8580,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (local_group)
goto next_group;
- /*
- * In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity so that we'll try
- * and move all the excess tasks away. We lower the capacity
- * of a group only if the local group has the capacity to fit
- * these excess tasks. The extra check prevents the case where
- * you always pull from the heaviest group when it is already
- * under-utilized (possible with a large weight task outweighs
- * the tasks on the system).
- */
- if (prefer_sibling && sds->local &&
- group_has_capacity(env, local) &&
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
- sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
- }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -8272,13 +8588,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
- sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
+ /* Tag domain that child domain prefers tasks go to siblings first */
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8309,203 +8627,177 @@ next_group:
}
/**
- * check_asym_packing - Check to see if the group is packed into the
- * sched domain.
- *
- * This is primarily intended to used at the sibling level. Some
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle. When in lower SMT modes, the threads will
- * perform better since they share less core resources. Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads. It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on. Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in env->imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
-{
- int busiest_cpu;
-
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
-
- if (env->idle == CPU_NOT_IDLE)
- return 0;
-
- if (!sds->busiest)
- return 0;
-
- busiest_cpu = sds->busiest->asym_prefer_cpu;
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
- return 0;
-
- env->imbalance = sds->busiest_stat.group_load;
-
- return 1;
-}
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- * amongst the groups of a sched_domain, during
- * load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, capa_now = 0, capa_move = 0;
- unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (!local->sum_nr_running)
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
- else if (busiest->load_per_task > local->load_per_task)
- imbn = 1;
+ if (busiest->group_type == group_misfit_task) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ return;
+ }
- scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- busiest->group_capacity;
+ if (busiest->group_type == group_asym_packing) {
+ /*
+ * In case of asym capacity, we will try to migrate all load to
+ * the preferred CPU.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = busiest->sum_h_nr_running;
+ return;
+ }
- if (busiest->avg_load + scaled_busy_load_per_task >=
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
- env->imbalance = busiest->load_per_task;
+ if (busiest->group_type == group_imbalanced) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure CPU-load equilibrium, try to move any task to fix
+ * the imbalance. The next load balance will take care of
+ * balancing back the system.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
return;
}
/*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU capacity used by
- * moving them.
+ * Try to use spare capacity of local group without overloading it or
+ * emptying busiest.
*/
+ if (local->group_type == group_has_spare) {
+ if (busiest->group_type > group_fully_busy) {
+ /*
+ * If busiest is overloaded, try to fill spare
+ * capacity. This might end up creating spare capacity
+ * in busiest or busiest still being overloaded but
+ * there is no simple way to directly compute the
+ * amount of load to migrate in order to balance the
+ * system.
+ */
+ env->migration_type = migrate_util;
+ env->imbalance = max(local->group_capacity, local->group_util) -
+ local->group_util;
- capa_now += busiest->group_capacity *
- min(busiest->load_per_task, busiest->avg_load);
- capa_now += local->group_capacity *
- min(local->load_per_task, local->avg_load);
- capa_now /= SCHED_CAPACITY_SCALE;
+ /*
+ * In some cases, the group's utilization is max or even
+ * higher than capacity because of migrations but the
+ * local CPU is (newly) idle. There is at least one
+ * waiting task in this overloaded busiest group. Let's
+ * try to pull it.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ }
- /* Amount of load we'd subtract */
- if (busiest->avg_load > scaled_busy_load_per_task) {
- capa_move += busiest->group_capacity *
- min(busiest->load_per_task,
- busiest->avg_load - scaled_busy_load_per_task);
- }
+ return;
+ }
- /* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_capacity <
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
- tmp = (busiest->avg_load * busiest->group_capacity) /
- local->group_capacity;
- } else {
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- local->group_capacity;
- }
- capa_move += local->group_capacity *
- min(local->load_per_task, local->avg_load + tmp);
- capa_move /= SCHED_CAPACITY_SCALE;
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
+ unsigned int nr_diff = busiest->sum_nr_running;
+ /*
+ * When prefer sibling, evenly spread running tasks on
+ * groups.
+ */
+ env->migration_type = migrate_task;
+ lsub_positive(&nr_diff, local->sum_nr_running);
+ env->imbalance = nr_diff >> 1;
+ } else {
- /* Move if we gain throughput */
- if (capa_move > capa_now)
- env->imbalance = busiest->load_per_task;
-}
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
+ busiest->idle_cpus) >> 1);
+ }
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- * groups of a given sched_domain during load balance.
- * @env: load balance environment
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
- unsigned long max_pull, load_above_capacity = ~0UL;
- struct sg_lb_stats *local, *busiest;
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA) {
+ unsigned int imbalance_min;
- local = &sds->local_stat;
- busiest = &sds->busiest_stat;
+ /*
+ * Compute an allowed imbalance based on a simple
+ * pair of communicating tasks that should remain
+ * local and ignore them.
+ *
+ * NOTE: Generally this would have been based on
+ * the domain size and this was evaluated. However,
+ * the benefit is similar across a range of workloads
+ * and machines but scaling by the domain size adds
+ * the risk that lower domains have to be rebalanced.
+ */
+ imbalance_min = 2;
+ if (busiest->sum_nr_running <= imbalance_min)
+ env->imbalance = 0;
+ }
- if (busiest->group_type == group_imbalanced) {
- /*
- * In the group_imb case we cannot rely on group-wide averages
- * to ensure CPU-load equilibrium, look at wider averages. XXX
- */
- busiest->load_per_task =
- min(busiest->load_per_task, sds->avg_load);
+ return;
}
/*
- * Avg load of busiest sg can be less and avg load of local sg can
- * be greater than avg load across all sgs of sd because avg load
- * factors in sg capacity and sgs with smaller group_type are
- * skipped when updating the busiest sg:
+ * Local is fully busy but has to take more load to relieve the
+ * busiest group
*/
- if (busiest->group_type != group_misfit_task &&
- (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load)) {
- env->imbalance = 0;
- return fix_small_imbalance(env, sds);
- }
+ if (local->group_type < group_overloaded) {
+ /*
+ * Local will become overloaded so the avg_load metrics are
+ * finally needed.
+ */
- /*
- * If there aren't any idle CPUs, avoid creating some.
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
- if (load_above_capacity > busiest->group_capacity) {
- load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
- load_above_capacity /= busiest->group_capacity;
- } else
- load_above_capacity = ~0UL;
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
- * We're trying to get all the CPUs to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded CPU below the average load. At the same time,
- * we also don't want to reduce the group load below the group
- * capacity. Thus we look for the minimum possible imbalance.
+ * Both group are or will become overloaded and we're trying to get all
+ * the CPUs to the average_load, so we don't want to push ourselves
+ * above the average load, nor do we wish to reduce the max loaded CPU
+ * below the average load. At the same time, we also don't want to
+ * reduce the group load below the group capacity. Thus we look for
+ * the minimum possible imbalance.
*/
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
- /* How much load to actually move to equalise the imbalance */
+ env->migration_type = migrate_load;
env->imbalance = min(
- max_pull * busiest->group_capacity,
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
-
- /* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = max_t(long, env->imbalance,
- busiest->group_misfit_task_load);
- }
-
- /*
- * if *imbalance is less than the average load per runnable task
- * there is no guarantee that any tasks will be moved so we'll have
- * a think about bumping its value to force at least one task to be
- * moved
- */
- if (env->imbalance < busiest->load_per_task)
- return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
+/*
+ * Decision matrix according to the local and busiest group type:
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare nr_idle balanced N/A N/A balanced balanced
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
+ * misfit_task force N/A N/A N/A force force
+ * asym_packing force force N/A N/A force force
+ * imbalanced force force N/A N/A force force
+ * overloaded force force N/A N/A force avg_load
+ *
+ * N/A : Not Applicable because already filtered while updating
+ * statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force : Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
+ * different in groups.
+ */
+
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
@@ -8525,7 +8817,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
init_sd_lb_stats(&sds);
/*
- * Compute the various statistics relavent for load balancing at
+ * Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
@@ -8540,17 +8832,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- if (check_asym_packing(env, &sds))
- return sds.busiest;
-
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || busiest->sum_nr_running == 0)
+ if (!sds.busiest)
goto out_balanced;
- /* XXX broken for overlapping NUMA groups */
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
- / sds.total_capacity;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
@@ -8561,55 +8853,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/*
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- * capacities from resulting in underutilization due to avg_load.
- */
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
-
- /* Misfit tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task)
- goto force_balance;
-
- /*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
- if (local->avg_load >= busiest->avg_load)
+ if (local->group_type > busiest->group_type)
goto out_balanced;
/*
- * Don't pull any tasks if this group is already above the domain
- * average load.
+ * When groups are overloaded, use the avg_load to ensure fairness
+ * between tasks.
*/
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
+ if (local->group_type == group_overloaded) {
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+ sds.total_capacity;
- if (env->idle == CPU_IDLE) {
/*
- * This CPU is idle. If the busiest group is not overloaded
- * and there is no imbalance between this and busiest group
- * wrt idle CPUs, it is balanced. The imbalance becomes
- * significant if the diff is greater than 1 otherwise we
- * might end up to just move the imbalance on another group
+ * Don't pull any tasks if this group is already above the
+ * domain average load.
*/
- if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- } else {
+
/*
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
- * imbalance_pct to be conservative.
+ * If the busiest group is more loaded, use imbalance_pct to be
+ * conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
+ /* Try to move all excess tasks to child's sibling domain */
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ busiest->sum_nr_running > local->sum_nr_running + 1)
+ goto force_balance;
+
+ if (busiest->group_type != group_overloaded) {
+ if (env->idle == CPU_NOT_IDLE)
+ /*
+ * If the busiest group is not overloaded (and as a
+ * result the local one too) but this CPU is already
+ * busy, let another idle CPU try to pull task.
+ */
+ goto out_balanced;
+
+ if (busiest->group_weight > 1 &&
+ local->idle_cpus <= (busiest->idle_cpus + 1))
+ /*
+ * If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest
+ * group wrt idle CPUs, it is balanced. The imbalance
+ * becomes significant if the diff is greater than 1
+ * otherwise we might end up to just move the imbalance
+ * on another group. Of course this applies only if
+ * there is more than 1 CPU per group.
+ */
+ goto out_balanced;
+
+ if (busiest->sum_h_nr_running == 1)
+ /*
+ * busiest doesn't have any tasks waiting to run
+ */
+ goto out_balanced;
+ }
+
force_balance:
/* Looks like there is an imbalance. Compute it */
- env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8625,11 +8942,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_capacity = 1;
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int busiest_nr = 0;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, load;
+ unsigned long capacity, load, util;
+ unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8657,20 +8976,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
- * seek the "biggest" misfit task.
- */
- if (env->src_grp_type == group_misfit_task) {
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- continue;
- }
-
capacity = capacity_of(i);
+ nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -8680,35 +8987,69 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
capacity_of(env->dst_cpu) < capacity &&
- rq->nr_running == 1)
+ nr_running == 1)
continue;
- load = cpu_runnable_load(rq);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * When comparing with load imbalance, use cpu_load()
+ * which is not scaled with the CPU capacity.
+ */
+ load = cpu_load(rq);
- /*
- * When comparing with imbalance, use cpu_runnable_load()
- * which is not scaled with the CPU capacity.
- */
+ if (nr_running == 1 && load > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ break;
- if (rq->nr_running == 1 && load > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
- continue;
+ /*
+ * For the load comparisons with the other CPUs,
+ * consider the cpu_load() scaled with the CPU
+ * capacity, so that the load can be moved away
+ * from the CPU that is potentially running at a
+ * lower capacity.
+ *
+ * Thus we're looking for max(load_i / capacity_i),
+ * crosswise multiplication to rid ourselves of the
+ * division works out to:
+ * load_i * capacity_j > load_j * capacity_i;
+ * where j is our previous maximum.
+ */
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_util:
+ util = cpu_util(cpu_of(rq));
+
+ if (busiest_util < util) {
+ busiest_util = util;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_task:
+ if (busiest_nr < nr_running) {
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_misfit:
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
+ * simply seek the "biggest" misfit task.
+ */
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ break;
- /*
- * For the load comparisons with the other CPU's, consider
- * the cpu_runnable_load() scaled with the CPU capacity, so
- * that the load can be moved away from the CPU that is
- * potentially running at a lower capacity.
- *
- * Thus we're looking for max(load_i / capacity_i), crosswise
- * multiplication to rid ourselves of the division works out
- * to: load_i * capacity_j > load_j * capacity_i; where j is
- * our previous maximum.
- */
- if (load * busiest_capacity > busiest_load * capacity) {
- busiest_load = load;
- busiest_capacity = capacity;
- busiest = rq;
}
}
@@ -8754,7 +9095,7 @@ voluntary_active_balance(struct lb_env *env)
return 1;
}
- if (env->src_grp_type == group_misfit_task)
+ if (env->migration_type == migrate_misfit)
return 1;
return 0;
@@ -9047,9 +9388,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -9070,10 +9412,10 @@ out_one_pinned:
ld_moved = 0;
/*
- * idle_balance() disregards balance intervals, so we could repeatedly
- * reach this code, which would lead to balance_interval skyrocketting
- * in a short amount of time. Skip the balance_interval increase logic
- * to avoid that.
+ * newidle_balance() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketting in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9227,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9263,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9279,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -9782,8 +10126,13 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ * < 0 - we released the lock and there are !fair tasks present
+ * 0 - failed, no new tasks
+ * > 0 - success, new (fair) tasks present
*/
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9791,6 +10140,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
+ update_misfit_status(NULL, this_rq);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10025,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10115,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
@@ -10175,9 +10528,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10295,18 +10658,18 @@ err:
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
+ struct rq_flags rf;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
-
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
}
@@ -10447,10 +10810,12 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
- .pick_next_task = pick_next_task_fair,
+ .pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
+ .balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -10461,7 +10826,6 @@ const struct sched_class fair_sched_class = {
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 2410db5e9a35..7481cd96f391 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
+SCHED_FEAT(UTIL_EST_FASTUP, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..b743bf38f08f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* update no idle residency and return.
*/
if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
+ dev->last_residency_ns = 0;
local_irq_enable();
return -EBUSY;
}
@@ -158,14 +158,16 @@ static void cpuidle_idle_call(void)
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
+ * activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
+ u64 max_latency_ns;
+
if (idle_should_enter_s2idle()) {
rcu_idle_enter();
@@ -176,12 +178,16 @@ static void cpuidle_idle_call(void)
}
rcu_idle_exit();
+
+ max_latency_ns = U64_MAX;
+ } else {
+ max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
rcu_idle_enter();
- next_state = cpuidle_find_deepest_state(drv, dev);
+ next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
} else {
bool stop_tick = true;
@@ -238,16 +244,16 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- check_pgt_cache();
rmb();
+ local_irq_disable();
+
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick_protected();
+ tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
- local_irq_disable();
arch_cpu_idle_enter();
/*
@@ -311,7 +317,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void play_idle(unsigned long duration_ms)
+void play_idle_precise(u64 duration_ns, u64 latency_ns)
{
struct idle_timer it;
@@ -323,28 +329,29 @@ void play_idle(unsigned long duration_ms)
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
- WARN_ON_ONCE(!duration_ms);
+ WARN_ON_ONCE(!duration_ns);
rcu_sleep_check();
preempt_disable();
current->flags |= PF_IDLE;
- cpuidle_use_deepest_state(true);
+ cpuidle_use_deepest_state(latency_ns);
it.done = 0;
hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+ hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
+ HRTIMER_MODE_REL_PINNED);
while (!READ_ONCE(it.done))
do_idle();
- cpuidle_use_deepest_state(false);
+ cpuidle_use_deepest_state(0);
current->flags &= ~PF_IDLE;
preempt_fold_need_resched();
preempt_enable();
}
-EXPORT_SYMBOL_GPL(play_idle);
+EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
@@ -364,6 +371,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
+
+static int
+balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return WARN_ON_ONCE(1);
+}
#endif
/*
@@ -374,14 +387,23 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
- put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+}
- return rq->idle;
+struct task_struct *pick_next_task_idle(struct rq *rq)
+{
+ struct task_struct *next = rq->idle;
+
+ set_next_task_idle(rq, next, true);
+
+ return next;
}
/*
@@ -397,10 +419,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
raw_spin_lock_irq(&rq->lock);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -413,10 +431,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
@@ -451,13 +465,14 @@ const struct sched_class idle_sched_class = {
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
+ .balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ccb28085b114..008d6ac2342b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled);
int housekeeping_any_cpu(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
+ int cpu;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (housekeeping_flags & flags) {
+ cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ }
+ }
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
@@ -155,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
continue;
}
+ if (!strncmp(str, "managed_irq,", 12)) {
+ str += 12;
+ flags |= HK_FLAG_MANAGED_IRQ;
+ continue;
+ }
+
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..de22da666ac7 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NO_HZ mode, if there's any pending delta, fold it
- * into the pending NO_HZ delta.
- */
- delta = calc_load_fold_active(this_rq, 0);
+ delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
}
}
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
- delta = calc_load_nohz_fold();
+ delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..168479a7d61b 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,10 +30,42 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_rq_state(void *info)
+{
+ struct mm_struct *mm = (struct mm_struct *) info;
+
+ if (current->mm != mm)
+ return;
+ this_cpu_write(runqueues.membarrier_state,
+ atomic_read(&mm->membarrier_state));
+ /*
+ * Issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+ * guarantee that no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+ /*
+ * Issue a memory barrier before clearing membarrier_state to
+ * guarantee that no memory access prior to exec is reordered after
+ * clearing this state.
+ */
+ smp_mb();
+ atomic_set(&mm->membarrier_state, 0);
+ /*
+ * Keep the runqueue membarrier_state in sync with this mm
+ * membarrier_state.
+ */
+ this_cpu_write(runqueues.membarrier_state, 0);
+}
+
static int membarrier_global_expedited(void)
{
int cpu;
- bool fallback = false;
cpumask_var_t tmpmask;
if (num_online_cpus() == 1)
@@ -45,17 +77,11 @@ static int membarrier_global_expedited(void)
*/
smp_mb(); /* system call entry is not a mb. */
- /*
- * Expedited membarrier commands guarantee that they won't
- * block, hence the GFP_NOWAIT allocation flag and fallback
- * implementation.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
- /* Fallback for OOM. */
- fallback = true;
- }
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
cpus_read_lock();
+ rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -70,23 +96,28 @@ static int membarrier_global_expedited(void)
if (cpu == raw_smp_processor_id())
continue;
- rcu_read_lock();
- p = task_rcu_dereference(&cpu_rq(cpu)->curr);
- if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
- MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
- if (!fallback)
- __cpumask_set_cpu(cpu, tmpmask);
- else
- smp_call_function_single(cpu, ipi_mb, NULL, 1);
- }
- rcu_read_unlock();
- }
- if (!fallback) {
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
- free_cpumask_var(tmpmask);
+ if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+ continue;
+
+ /*
+ * Skip the CPU if it runs a kernel thread. The scheduler
+ * leaves the prior task mm in place as an optimization when
+ * scheduling a kthread.
+ */
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ __cpumask_set_cpu(cpu, tmpmask);
}
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -101,22 +132,22 @@ static int membarrier_global_expedited(void)
static int membarrier_private_expedited(int flags)
{
int cpu;
- bool fallback = false;
cpumask_var_t tmpmask;
+ struct mm_struct *mm = current->mm;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
- if (!(atomic_read(&current->mm->membarrier_state) &
+ if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
} else {
- if (!(atomic_read(&current->mm->membarrier_state) &
+ if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
}
- if (num_online_cpus() == 1)
+ if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
return 0;
/*
@@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
- /*
- * Expedited membarrier commands guarantee that they won't
- * block, hence the GFP_NOWAIT allocation flag and fallback
- * implementation.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
- /* Fallback for OOM. */
- fallback = true;
- }
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
cpus_read_lock();
+ rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -149,22 +174,17 @@ static int membarrier_private_expedited(int flags)
*/
if (cpu == raw_smp_processor_id())
continue;
- rcu_read_lock();
- p = task_rcu_dereference(&cpu_rq(cpu)->curr);
- if (p && p->mm == current->mm) {
- if (!fallback)
- __cpumask_set_cpu(cpu, tmpmask);
- else
- smp_call_function_single(cpu, ipi_mb, NULL, 1);
- }
- rcu_read_unlock();
- }
- if (!fallback) {
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
- free_cpumask_var(tmpmask);
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
}
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -177,32 +197,78 @@ static int membarrier_private_expedited(int flags)
return 0;
}
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+ int membarrier_state = atomic_read(&mm->membarrier_state);
+ cpumask_var_t tmpmask;
+ int cpu;
+
+ if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+ this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+ /*
+ * For single mm user, we can simply issue a memory barrier
+ * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+ * mm and in the current runqueue to guarantee that no memory
+ * access following registration is reordered before
+ * registration.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * For mm with multiple users, we need to ensure all future
+ * scheduler executions will observe @mm's new membarrier
+ * state.
+ */
+ synchronize_rcu();
+
+ /*
+ * For each cpu runqueue, if the task's mm match @mm, ensure that all
+ * @mm's membarrier state set bits are also set in in the runqueue's
+ * membarrier state. This ensures that a runqueue scheduling
+ * between threads which are users of @mm has its membarrier state
+ * updated.
+ */
+ cpus_read_lock();
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ p = rcu_dereference(rq->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
+ cpus_read_unlock();
+
+ return 0;
+}
+
static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ int ret;
if (atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
return 0;
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
- if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
- /*
- * For single mm user, single threaded process, we can
- * simply issue a memory barrier after setting
- * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
- * no memory access following registration is reordered
- * before registration.
- */
- smp_mb();
- } else {
- /*
- * For multi-mm user threads, we need to ensure all
- * future scheduler executions will observe the new
- * thread flag state for this mm.
- */
- synchronize_rcu();
- }
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
@@ -213,12 +279,15 @@ static int membarrier_register_private_expedited(int flags)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
- int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+ int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+ ret;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
- state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
}
/*
@@ -226,20 +295,15 @@ static int membarrier_register_private_expedited(int flags)
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state) & state)
+ if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
return 0;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
- &mm->membarrier_state);
- if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
- /*
- * Ensure all future scheduler executions will observe the
- * new thread flag state for this process.
- */
- synchronize_rcu();
- }
- atomic_or(state, &mm->membarrier_state);
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ atomic_or(set_state, &mm->membarrier_state);
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
+ atomic_or(ready_state, &mm->membarrier_state);
return 0;
}
@@ -253,8 +317,10 @@ static int membarrier_register_private_expedited(int flags)
* command specified does not exist, not available on the running
* kernel, or if the command argument is invalid, this system call
* returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
*
* All memory accesses performed in program order from each targeted thread
* is guaranteed to be ordered with respect to sys_membarrier(). If we use
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a96db50d40e0..bd006b79b360 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* Step 2
*/
delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib to dissapear entirely,
+ * so no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
}
sa->period_contrib = delta;
@@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
- * update_blocked_averages()
+ * update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 23fbbcc414d5..028520702717 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
@@ -1131,7 +1132,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
@@ -1190,7 +1199,10 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- buf_size = min(nbytes, (sizeof(buf) - 1));
+ if (!nbytes)
+ return -EINVAL;
+
+ buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
@@ -1242,39 +1254,41 @@ static int psi_fop_release(struct inode *inode, struct file *file)
return single_release(inode, file);
}
-static const struct file_operations psi_io_fops = {
- .open = psi_io_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_io_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_io_proc_ops = {
+ .proc_open = psi_io_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_io_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_memory_fops = {
- .open = psi_memory_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_memory_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_memory_proc_ops = {
+ .proc_open = psi_memory_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_memory_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_cpu_fops = {
- .open = psi_cpu_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_cpu_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_cpu_proc_ops = {
+ .proc_open = psi_cpu_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_cpu_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ }
return 0;
}
module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a532558a5176..4043abe45459 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
@@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
* to update the period.
*/
hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
- hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -436,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->on_rq;
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where uclamp_min value
+ * is higher than the capacity of a @cpu. For non-heterogeneous system this
+ * function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
+
+ /* Only heterogeneous systems can benefit from this check */
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return true;
+
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
+
+ cpu_cap = capacity_orig_of(cpu);
+
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -1390,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1421,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
@@ -1448,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
return;
/*
@@ -1468,6 +1515,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
+static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+ if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+
+ return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
+}
#endif /* CONFIG_SMP */
/*
@@ -1498,12 +1561,25 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+ if (!first)
+ return;
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ rt_queue_push_tasks(rq);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1537,58 +1613,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
- struct rt_rq *rt_rq = &rq->rt;
-
- if (need_pull_rt_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_rt_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
-
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_queued check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
- if (!rt_rq->rt_queued)
+ if (!sched_rt_runnable(rq))
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
-
- set_next_task(rq, p);
-
- rt_queue_push_tasks(rq);
-
- /*
- * If prev task was rt, put_prev_task() has already updated the
- * utilization. We only care of the case where we start to schedule a
- * rt task
- */
- if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
-
+ set_next_task_rt(rq, p, true);
return p;
}
@@ -1614,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
+ rt_task_fits_capacity(p, cpu))
return 1;
return 0;
@@ -1656,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
+ rt_task_fits_capacity))
return -1; /* No targets found */
/*
@@ -2160,12 +2195,14 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ bool need_to_push = !task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio);
+
+ if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
push_rt_tasks(rq);
}
@@ -2237,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ bool need_to_push = rq->rt.overloaded ||
+ !rt_task_fits_capacity(p, cpu_of(rq));
+
+ if (p->nr_cpus_allowed > 1 && need_to_push)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
@@ -2304,8 +2344,10 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
- if (p->rt.timeout > next)
- p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ if (p->rt.timeout > next) {
+ posix_cputimers_rt_watchdog(&p->posix_cputimers,
+ p->se.sum_exec_runtime);
+ }
}
}
#else
@@ -2354,11 +2396,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2380,10 +2417,11 @@ const struct sched_class rt_sched_class = {
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
+ .balance = balance_rt,
.select_task_rq = select_task_rq_rt,
-
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
@@ -2391,7 +2429,6 @@ const struct sched_class rt_sched_class = {
.switched_from = switched_from_rt,
#endif
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..9ea647835fd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -335,8 +335,6 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
u8 idle;
u8 period_active;
@@ -393,6 +391,16 @@ struct task_group {
#endif
struct cfs_bandwidth cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* The two decimal precision [%] value requested from user-space */
+ unsigned int uclamp_pct[UCLAMP_CNT];
+ /* Clamp values requested for a task group */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /* Effective clamp values used for a task group */
+ struct uclamp_se uclamp[UCLAMP_CNT];
+#endif
+
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -483,7 +491,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -556,8 +565,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -777,9 +784,6 @@ struct root_domain {
struct perf_domain __rcu *pd;
};
-extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -892,7 +896,7 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr;
+ struct task_struct __rcu *curr;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -907,6 +911,10 @@ struct rq {
atomic_t nr_iowait;
+#ifdef CONFIG_MEMBARRIER
+ int membarrier_state;
+#endif
+
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain __rcu *sd;
@@ -1261,16 +1269,18 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-#endif
-
-#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1449,10 +1459,14 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
#else
static inline void sched_ttwu_pending(void) { }
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1699,20 +1713,13 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
- /*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
- *
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
- */
- struct task_struct * (*pick_next_task)(struct rq *rq,
- struct task_struct *prev,
- struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
+ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
@@ -1725,7 +1732,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
@@ -1755,12 +1761,14 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
+ WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
}
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- curr->sched_class->set_curr_task(rq);
+ WARN_ON_ONCE(rq->curr != next);
+ next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
@@ -1768,8 +1776,12 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
#else
#define sched_class_highest (&dl_sched_class)
#endif
+
+#define for_class_range(class, _from, _to) \
+ for (class = (_from); class != (_to); class = class->next)
+
#define for_each_class(class) \
- for (class = sched_class_highest; class; class = class->next)
+ for_class_range(class, sched_class_highest, NULL)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1777,6 +1789,28 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
+static inline bool sched_stop_runnable(struct rq *rq)
+{
+ return rq->stop && task_on_rq_queued(rq->stop);
+}
+
+static inline bool sched_dl_runnable(struct rq *rq)
+{
+ return rq->dl.dl_nr_running > 0;
+}
+
+static inline bool sched_rt_runnable(struct rq *rq)
+{
+ return rq->rt.rt_queued > 0;
+}
+
+static inline bool sched_fair_runnable(struct rq *rq)
+{
+ return rq->cfs.nr_running > 0;
+}
+
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+extern struct task_struct *pick_next_task_idle(struct rq *rq);
#ifdef CONFIG_SMP
@@ -1943,7 +1977,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1995,7 +2029,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2266,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
-unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
- unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2290,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
return clamp(util, min_util, max_util);
}
-
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
-{
- return uclamp_util_with(rq, util, NULL);
-}
#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
-{
- return util;
-}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
return util;
}
@@ -2423,3 +2449,46 @@ static inline bool sched_energy_enabled(void)
static inline bool sched_energy_enabled(void) { return false; }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+ int membarrier_state;
+
+ if (prev_mm == next_mm)
+ return;
+
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+#endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index aa0de240fb41..ba683fe81a6e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(rq), delta = 0;
- if (unlikely(sched_info_on()))
+ if (sched_info_on()) {
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
+ }
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on())) {
+ if (sched_info_on()) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(sched_info_on()))
+ if (sched_info_on())
__sched_info_switch(rq, prev, next);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..4c9e9975684f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
+
+static int
+balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return sched_stop_runnable(rq);
+}
#endif /* CONFIG_SMP */
static void
@@ -23,19 +29,18 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
- struct task_struct *stop = rq->stop;
+ stop->se.exec_start = rq_clock_task(rq);
+}
- if (!stop || !task_on_rq_queued(stop))
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ if (!sched_stop_runnable(rq))
return NULL;
- put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
-
- return stop;
+ set_next_task_stop(rq, rq->stop, true);
+ return rq->stop;
}
static void
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
@@ -128,13 +126,14 @@ const struct sched_class stop_sched_class = {
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
+ .balance = balance_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f751ce0b783e..dfb64c08a407 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd,
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
- else
- request = default_relax_domain_level;
+ request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
- if (request < sd->level) {
+
+ if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* Turn on idle balance on this domain: */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
@@ -1284,6 +1281,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1402,7 +1400,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
@@ -1724,6 +1722,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
}
}
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ int i, j = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ return nr_cpu_ids;
+}
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -1862,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1927,7 +1981,7 @@ next_level:
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state;
+ enum s_alloc alloc_state = sa_none;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -1935,6 +1989,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ if (WARN_ON(cpumask_empty(cpu_map)))
+ goto error;
+
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
@@ -1954,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
@@ -2005,7 +2065,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (has_asym)
- static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+ static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
@@ -2100,8 +2160,12 @@ int sched_init_domains(const struct cpumask *cpu_map)
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
+ unsigned int cpu = cpumask_any(cpu_map);
int i;
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
+ static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
@@ -2149,16 +2213,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* ndoms_new == 0 is a special case for destroying existing domains,
* and it will not create the default domain.
*
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
int i, j, n;
int new_topology;
- mutex_lock(&sched_domains_mutex);
+ lockdep_assert_held(&sched_domains_mutex);
/* Always unregister in case we don't destroy any domains: */
unregister_sched_domain_sysctl();
@@ -2183,8 +2247,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j))
+ dattrs_equal(dattr_cur, i, dattr_new, j)) {
+ struct root_domain *rd;
+
+ /*
+ * This domain won't be destroyed and as such
+ * its dl_bw->total_bw needs to be cleared. It
+ * will be recomputed in function
+ * update_tasks_root_domain().
+ */
+ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+ dl_clear_root_domain(rd);
goto match1;
+ }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2241,6 +2316,15 @@ match3:
ndoms_cur = ndoms_new;
register_sched_domain_sysctl();
+}
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..ba059fbfc53a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
* @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
@@ -183,26 +182,44 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
* accessing the task state.
*/
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+ void *key)
{
- int wake_flags = 1; /* XXX WF_SYNC */
-
if (unlikely(!wq_head))
return;
- if (unlikely(nr_exclusive != 1))
- wake_flags = 0;
-
- __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+ __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+/**
+ * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key)
+{
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
+
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
{
- __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+ __wake_up_sync_key(wq_head, mode, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 45eba18a2898..02ce292b9bc0 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
.bit_nr = -1,
},
.wq_entry = {
+ .flags = flags,
.private = current,
.func = var_wake_function,
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..b6ea3dcb57bf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -75,6 +75,7 @@ struct seccomp_knotif {
/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
int error;
long val;
+ u32 flags;
/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
struct completion ready;
@@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
-static void seccomp_do_user_notification(int this_syscall,
- struct seccomp_filter *match,
- const struct seccomp_data *sd)
+static int seccomp_do_user_notification(int this_syscall,
+ struct seccomp_filter *match,
+ const struct seccomp_data *sd)
{
int err;
+ u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
@@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall,
if (err == 0) {
ret = n.val;
err = n.error;
+ flags = n.flags;
}
/*
@@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall,
list_del(&n.list);
out:
mutex_unlock(&match->notify_lock);
+
+ /* Userspace requests to continue the syscall. */
+ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return 0;
+
syscall_set_return_value(current, task_pt_regs(current),
err, ret);
+ return -1;
}
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
@@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_USER_NOTIF:
- seccomp_do_user_notification(this_syscall, match, sd);
- goto skip;
+ if (seccomp_do_user_notification(this_syscall, match, sd))
+ goto skip;
+
+ return 0;
case SECCOMP_RET_LOG:
seccomp_log(this_syscall, 0, action, true);
@@ -1015,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
struct seccomp_notif unotif;
ssize_t ret;
+ /* Verify that we're not given garbage to keep struct extensible. */
+ ret = check_zeroed_user(buf, sizeof(unotif));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -EINVAL;
+
memset(&unotif, 0, sizeof(unotif));
ret = down_interruptible(&filter->notif->request);
@@ -1087,7 +1105,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (copy_from_user(&resp, buf, sizeof(resp)))
return -EFAULT;
- if (resp.flags)
+ if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return -EINVAL;
+
+ if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+ (resp.error || resp.val))
return -EINVAL;
ret = mutex_lock_interruptible(&filter->notify_lock);
@@ -1116,6 +1138,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->state = SECCOMP_NOTIFY_REPLIED;
knotif->error = resp.error;
knotif->val = resp.val;
+ knotif->flags = resp.flags;
complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 534fec266a33..9ad8dea93dbb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
* must see ->sighand == NULL.
*/
spin_lock_irqsave(&sighand->siglock, *flags);
- if (likely(sighand == tsk->sighand))
+ if (likely(sighand == rcu_access_pointer(tsk->sighand)))
break;
spin_unlock_irqrestore(&sighand->siglock, *flags);
}
@@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
*/
preempt_disable();
read_unlock(&tasklist_lock);
- preempt_enable_no_resched();
cgroup_enter_frozen();
+ preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
} else {
@@ -3678,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
static struct pid *pidfd_to_pid(const struct file *file)
{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
+ struct pid *pid;
+
+ pid = pidfd_pid(file);
+ if (!IS_ERR(pid))
+ return pid;
return tgid_pidfd_to_pid(file);
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 7dbcb402c2fc..d0ada39eb4d4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -395,22 +395,9 @@ call:
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
-/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. Preemption
- * must be disabled when calling this function.
- */
-void smp_call_function_many(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
+static void smp_call_function_many_cond(const struct cpumask *mask,
+ smp_call_func_t func, void *info,
+ bool wait, smp_cond_func_t cond_func)
{
struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
@@ -448,7 +435,8 @@ void smp_call_function_many(const struct cpumask *mask,
/* Fastpath: do that cpu by itself. */
if (next_cpu >= nr_cpu_ids) {
- smp_call_function_single(cpu, func, info, wait);
+ if (!cond_func || cond_func(cpu, info))
+ smp_call_function_single(cpu, func, info, wait);
return;
}
@@ -465,6 +453,9 @@ void smp_call_function_many(const struct cpumask *mask,
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
+ if (cond_func && !cond_func(cpu, info))
+ continue;
+
csd_lock(csd);
if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS;
@@ -486,6 +477,26 @@ void smp_call_function_many(const struct cpumask *mask,
}
}
}
+
+/**
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+void smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ smp_call_function_many_cond(mask, func, info, wait, NULL);
+}
EXPORT_SYMBOL(smp_call_function_many);
/**
@@ -668,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
- * @gfp_flags: GFP flags to use when allocating the cpumask
- * used internally by the function.
- *
- * The function might sleep if the GFP flags indicates a non
- * atomic allocation is allowed.
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
@@ -680,46 +686,27 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
- cpumask_var_t cpus;
- int cpu, ret;
-
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
-
- if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info))
- __cpumask_set_cpu(cpu, cpus);
- on_each_cpu_mask(cpus, func, info, wait);
- preempt_enable();
- free_cpumask_var(cpus);
- } else {
- /*
- * No free cpumask, bother. No matter, we'll
- * just have to IPI them one by one.
- */
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info)) {
- ret = smp_call_function_single(cpu, func,
- info, wait);
- WARN_ON_ONCE(ret);
- }
- preempt_enable();
+ int cpu = get_cpu();
+
+ smp_call_function_many_cond(mask, func, info, wait, cond_func);
+ if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
}
+ put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
- cpu_online_mask);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f5440abb7532..2af66e449aa6 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -20,7 +20,7 @@
* @nr_entries: Number of entries in the storage array
* @spaces: Number of leading spaces to print
*/
-void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
int spaces)
{
unsigned int i;
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print);
*
* Return: Number of bytes printed.
*/
-int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
unsigned int nr_entries, int spaces)
{
unsigned int generated, i, total = 0;
@@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
struct stacktrace_cookie c = {
.store = store,
.size = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + (current == tsk),
};
if (!try_get_task_stack(tsk))
@@ -298,7 +299,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
struct stack_trace trace = {
.entries = store,
.max_entries = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + (current == task),
};
save_stack_trace_tsk(task, &trace);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4f83f7bdf86..865bb0228ab6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -7,6 +7,7 @@
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*/
+#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/init.h>
@@ -167,7 +168,7 @@ static void set_state(struct multi_stop_data *msdata,
/* Reset ack counter. */
atomic_set(&msdata->thread_ack, msdata->num_threads);
smp_wmb();
- msdata->state = newstate;
+ WRITE_ONCE(msdata->state, newstate);
}
/* Last one to ack a state moves to the next state. */
@@ -186,7 +187,7 @@ void __weak stop_machine_yield(const struct cpumask *cpumask)
static int multi_cpu_stop(void *data)
{
struct multi_stop_data *msdata = data;
- enum multi_stop_state curstate = MULTI_STOP_NONE;
+ enum multi_stop_state newstate, curstate = MULTI_STOP_NONE;
int cpu = smp_processor_id(), err = 0;
const struct cpumask *cpumask;
unsigned long flags;
@@ -210,8 +211,9 @@ static int multi_cpu_stop(void *data)
do {
/* Chill out and ensure we re-read multi_stop_state. */
stop_machine_yield(cpumask);
- if (msdata->state != curstate) {
- curstate = msdata->state;
+ newstate = READ_ONCE(msdata->state);
+ if (newstate != curstate) {
+ curstate = newstate;
switch (curstate) {
case MULTI_STOP_DISABLE_IRQ:
local_irq_disable();
@@ -233,6 +235,7 @@ static int multi_cpu_stop(void *data)
*/
touch_nmi_watchdog();
}
+ rcu_momentary_dyntick_idle();
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
@@ -383,6 +386,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
*/
preempt_disable();
stop_cpus_in_progress = true;
+ barrier();
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -391,6 +395,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
+ barrier();
stop_cpus_in_progress = false;
preempt_enable();
@@ -437,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
* @cpumask were offline; otherwise, 0 if all executions of @fn
* returned 0, any non zero return value if any returned non zero.
*/
-int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
int ret;
@@ -448,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index 2969304c29fe..f9bc5c303e3f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -103,12 +103,6 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
-#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
-#endif
-#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
-#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
#endif
@@ -124,6 +118,12 @@
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef SET_TAGGED_ADDR_CTRL
+# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
+#endif
+#ifndef GET_TAGGED_ADDR_CTRL
+# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -1279,11 +1279,13 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
- struct oldold_utsname tmp = {};
+ struct oldold_utsname tmp;
if (!name)
return -EFAULT;
+ memset(&tmp, 0, sizeof(tmp));
+
down_read(&uts_sem);
memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
@@ -1557,15 +1559,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
retval = -EPERM;
if (!retval)
retval = security_task_setrlimit(tsk, resource, new_rlim);
- if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- new_rlim->rlim_cur = 1;
- }
}
if (!retval) {
if (old_rlim)
@@ -1576,10 +1569,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
task_unlock(tsk->group_leader);
/*
- * RLIMIT_CPU handling. Note that the kernel fails to return an error
- * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
- * very long-standing error, and fixing it now risks breakage of
- * applications, so we live with it
+ * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+ * infite. In case of RLIM_INFINITY the posix CPU timer code
+ * ignores the rlimit.
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
@@ -1773,8 +1765,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
unlock_task_sighand(p, &flags);
out:
- r->ru_utime = ns_to_timeval(utime);
- r->ru_stime = ns_to_timeval(stime);
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
@@ -2269,6 +2261,8 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2456,15 +2450,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT();
- break;
case PR_MPX_DISABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT();
- break;
+ /* No longer implemented: */
+ return -EINVAL;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
break;
@@ -2492,6 +2480,39 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_SET_TAGGED_ADDR_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = SET_TAGGED_ADDR_CTRL(arg2);
+ break;
+ case PR_GET_TAGGED_ADDR_CTRL:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = GET_TAGGED_ADDR_CTRL();
+ break;
+ case PR_SET_IO_FLUSHER:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (arg2 == 1)
+ current->flags |= PR_IO_FLUSHER;
+ else if (!arg2)
+ current->flags &= ~PR_IO_FLUSHER;
+ else
+ return -EINVAL;
+ break;
+ case PR_GET_IO_FLUSHER:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 34b76895b81e..3b69a560a7ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -410,6 +410,29 @@ COND_SYSCALL(send);
COND_SYSCALL(bdflush);
COND_SYSCALL(uselib);
+/* optional: time32 */
+COND_SYSCALL(time32);
+COND_SYSCALL(stime32);
+COND_SYSCALL(utime32);
+COND_SYSCALL(adjtimex_time32);
+COND_SYSCALL(sched_rr_get_interval_time32);
+COND_SYSCALL(nanosleep_time32);
+COND_SYSCALL(rt_sigtimedwait_time32);
+COND_SYSCALL_COMPAT(rt_sigtimedwait_time32);
+COND_SYSCALL(timer_settime32);
+COND_SYSCALL(timer_gettime32);
+COND_SYSCALL(clock_settime32);
+COND_SYSCALL(clock_gettime32);
+COND_SYSCALL(clock_getres_time32);
+COND_SYSCALL(clock_nanosleep_time32);
+COND_SYSCALL(utimes_time32);
+COND_SYSCALL(futimesat_time32);
+COND_SYSCALL(pselect6_time32);
+COND_SYSCALL_COMPAT(pselect6_time32);
+COND_SYSCALL(ppoll_time32);
+COND_SYSCALL_COMPAT(ppoll_time32);
+COND_SYSCALL(utimensat_time32);
+COND_SYSCALL(clock_adjtime32);
/*
* The syscalls below are not found in include/uapi/asm-generic/unistd.h
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
new file mode 100644
index 000000000000..ccb78509f1a8
--- /dev/null
+++ b/kernel/sysctl-test.c
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of proc sysctl.
+ */
+
+#include <kunit/test.h>
+#include <linux/sysctl.h>
+
+#define KUNIT_PROC_READ 0
+#define KUNIT_PROC_WRITE 1
+
+static int i_zero;
+static int i_one_hundred = 100;
+
+/*
+ * Test that proc_dointvec will not try to use a NULL .data field even when the
+ * length is non-zero.
+ */
+static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
+{
+ struct ctl_table null_data_table = {
+ .procname = "foo",
+ /*
+ * Here we are testing that proc_dointvec behaves correctly when
+ * we give it a NULL .data field. Normally this would point to a
+ * piece of memory where the value would be stored.
+ */
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ /*
+ * proc_dointvec expects a buffer in user space, so we allocate one. We
+ * also need to cast it to __user so sparse doesn't get mad.
+ */
+ void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
+ GFP_USER);
+ size_t len;
+ loff_t pos;
+
+ /*
+ * We don't care what the starting length is since proc_dointvec should
+ * not try to read because .data is NULL.
+ */
+ len = 1234;
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
+ KUNIT_PROC_READ, buffer, &len,
+ &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+
+ /*
+ * See above.
+ */
+ len = 1234;
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
+ KUNIT_PROC_WRITE, buffer, &len,
+ &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+}
+
+/*
+ * Similar to the previous test, we create a struct ctrl_table that has a .data
+ * field that proc_dointvec cannot do anything with; however, this time it is
+ * because we tell proc_dointvec that the size is 0.
+ */
+static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
+{
+ int data = 0;
+ struct ctl_table data_maxlen_unset_table = {
+ .procname = "foo",
+ .data = &data,
+ /*
+ * So .data is no longer NULL, but we tell proc_dointvec its
+ * length is 0, so it still shouldn't try to use it.
+ */
+ .maxlen = 0,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
+ GFP_USER);
+ size_t len;
+ loff_t pos;
+
+ /*
+ * As before, we don't care what buffer length is because proc_dointvec
+ * cannot do anything because its internal .data buffer has zero length.
+ */
+ len = 1234;
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
+ KUNIT_PROC_READ, buffer, &len,
+ &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+
+ /*
+ * See previous comment.
+ */
+ len = 1234;
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
+ KUNIT_PROC_WRITE, buffer, &len,
+ &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+}
+
+/*
+ * Here we provide a valid struct ctl_table, but we try to read and write from
+ * it using a buffer of zero length, so it should still fail in a similar way as
+ * before.
+ */
+static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
+{
+ int data = 0;
+ /* Good table. */
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
+ GFP_USER);
+ /*
+ * However, now our read/write buffer has zero length.
+ */
+ size_t len = 0;
+ loff_t pos;
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
+ &len, &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer,
+ &len, &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+}
+
+/*
+ * Test that proc_dointvec refuses to read when the file position is non-zero.
+ */
+static void sysctl_test_api_dointvec_table_read_but_position_set(
+ struct kunit *test)
+{
+ int data = 0;
+ /* Good table. */
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
+ GFP_USER);
+ /*
+ * We don't care about our buffer length because we start off with a
+ * non-zero file position.
+ */
+ size_t len = 1234;
+ /*
+ * proc_dointvec should refuse to read into the buffer since the file
+ * pos is non-zero.
+ */
+ loff_t pos = 1;
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
+ &len, &pos));
+ KUNIT_EXPECT_EQ(test, (size_t)0, len);
+}
+
+/*
+ * Test that we can read a two digit number in a sufficiently size buffer.
+ * Nothing fancy.
+ */
+static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
+{
+ int data = 0;
+ /* Good table. */
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ size_t len = 4;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+ /* Store 13 in the data field. */
+ *((int *)table.data) = 13;
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
+ user_buffer, &len, &pos));
+ KUNIT_ASSERT_EQ(test, (size_t)3, len);
+ buffer[len] = '\0';
+ /* And we read 13 back out. */
+ KUNIT_EXPECT_STREQ(test, "13\n", buffer);
+}
+
+/*
+ * Same as previous test, just now with negative numbers.
+ */
+static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
+{
+ int data = 0;
+ /* Good table. */
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ size_t len = 5;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+ *((int *)table.data) = -16;
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
+ user_buffer, &len, &pos));
+ KUNIT_ASSERT_EQ(test, (size_t)4, len);
+ buffer[len] = '\0';
+ KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer);
+}
+
+/*
+ * Test that a simple positive write works.
+ */
+static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
+{
+ int data = 0;
+ /* Good table. */
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ char input[] = "9";
+ size_t len = sizeof(input) - 1;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+
+ memcpy(buffer, input, len);
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
+ user_buffer, &len, &pos));
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, 9, *((int *)table.data));
+}
+
+/*
+ * Same as previous test, but now with negative numbers.
+ */
+static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
+{
+ int data = 0;
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ char input[] = "-9";
+ size_t len = sizeof(input) - 1;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+
+ memcpy(buffer, input, len);
+
+ KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
+ user_buffer, &len, &pos));
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, -9, *((int *)table.data));
+}
+
+/*
+ * Test that writing a value smaller than the minimum possible value is not
+ * allowed.
+ */
+static void sysctl_test_api_dointvec_write_single_less_int_min(
+ struct kunit *test)
+{
+ int data = 0;
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ size_t max_len = 32, len = max_len;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, max_len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+ unsigned long abs_of_less_than_min = (unsigned long)INT_MAX
+ - (INT_MAX + INT_MIN) + 1;
+
+ /*
+ * We use this rigmarole to create a string that contains a value one
+ * less than the minimum accepted value.
+ */
+ KUNIT_ASSERT_LT(test,
+ (size_t)snprintf(buffer, max_len, "-%lu",
+ abs_of_less_than_min),
+ max_len);
+
+ KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE,
+ user_buffer, &len, &pos));
+ KUNIT_EXPECT_EQ(test, max_len, len);
+ KUNIT_EXPECT_EQ(test, 0, *((int *)table.data));
+}
+
+/*
+ * Test that writing the maximum possible value works.
+ */
+static void sysctl_test_api_dointvec_write_single_greater_int_max(
+ struct kunit *test)
+{
+ int data = 0;
+ struct ctl_table table = {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &i_zero,
+ .extra2 = &i_one_hundred,
+ };
+ size_t max_len = 32, len = max_len;
+ loff_t pos = 0;
+ char *buffer = kunit_kzalloc(test, max_len, GFP_USER);
+ char __user *user_buffer = (char __user *)buffer;
+ unsigned long greater_than_max = (unsigned long)INT_MAX + 1;
+
+ KUNIT_ASSERT_GT(test, greater_than_max, (unsigned long)INT_MAX);
+ KUNIT_ASSERT_LT(test, (size_t)snprintf(buffer, max_len, "%lu",
+ greater_than_max),
+ max_len);
+ KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE,
+ user_buffer, &len, &pos));
+ KUNIT_ASSERT_EQ(test, max_len, len);
+ KUNIT_EXPECT_EQ(test, 0, *((int *)table.data));
+}
+
+static struct kunit_case sysctl_test_cases[] = {
+ KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data),
+ KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset),
+ KUNIT_CASE(sysctl_test_api_dointvec_table_len_is_zero),
+ KUNIT_CASE(sysctl_test_api_dointvec_table_read_but_position_set),
+ KUNIT_CASE(sysctl_test_dointvec_read_happy_single_positive),
+ KUNIT_CASE(sysctl_test_dointvec_read_happy_single_negative),
+ KUNIT_CASE(sysctl_test_dointvec_write_happy_single_positive),
+ KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative),
+ KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min),
+ KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max),
+ {}
+};
+
+static struct kunit_suite sysctl_test_suite = {
+ .name = "sysctl_test",
+ .test_cases = sysctl_test_cases,
+};
+
+kunit_test_suites(&sysctl_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d9605b..d396aaaf19a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -163,7 +163,7 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_SPARC
#endif
-#ifdef __hppa__
+#ifdef CONFIG_PARISC
extern int pwrsw_enabled;
#endif
@@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[];
extern struct ctl_table firmware_config_table[];
#endif
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif
@@ -619,7 +620,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef __hppa__
+#ifdef CONFIG_PARISC
{
.procname = "soft-power",
.data = &pwrsw_enabled,
@@ -1267,7 +1268,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_do_static_key,
},
#endif
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
{
.procname = "panic_on_rcu_stall",
.data = &sysctl_panic_on_rcu_stall,
@@ -1465,7 +1466,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
.extra2 = &four,
@@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec,
.extra1 = SYSCTL_ZERO,
},
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
{
.procname = "legacy_va_layout",
.data = &sysctl_legacy_va_layout,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 73c132095a7b..7d550cc76a3b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -18,1317 +18,12 @@
#include <linux/slab.h>
#include <linux/compat.h>
-#ifdef CONFIG_SYSCTL_SYSCALL
-
-struct bin_table;
-typedef ssize_t bin_convert_t(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
-
-static bin_convert_t bin_dir;
-static bin_convert_t bin_string;
-static bin_convert_t bin_intvec;
-static bin_convert_t bin_ulongvec;
-static bin_convert_t bin_uuid;
-static bin_convert_t bin_dn_node_address;
-
-#define CTL_DIR bin_dir
-#define CTL_STR bin_string
-#define CTL_INT bin_intvec
-#define CTL_ULONG bin_ulongvec
-#define CTL_UUID bin_uuid
-#define CTL_DNADR bin_dn_node_address
-
-#define BUFSZ 256
-
-struct bin_table {
- bin_convert_t *convert;
- int ctl_name;
- const char *procname;
- const struct bin_table *child;
-};
-
-static const struct bin_table bin_random_table[] = {
- { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
- { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
- { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
- { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
- { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
- { CTL_UUID, RANDOM_UUID, "uuid" },
- {}
-};
-
-static const struct bin_table bin_pty_table[] = {
- { CTL_INT, PTY_MAX, "max" },
- { CTL_INT, PTY_NR, "nr" },
- {}
-};
-
-static const struct bin_table bin_kern_table[] = {
- { CTL_STR, KERN_OSTYPE, "ostype" },
- { CTL_STR, KERN_OSRELEASE, "osrelease" },
- /* KERN_OSREV not used */
- { CTL_STR, KERN_VERSION, "version" },
- /* KERN_SECUREMASK not used */
- /* KERN_PROF not used */
- { CTL_STR, KERN_NODENAME, "hostname" },
- { CTL_STR, KERN_DOMAINNAME, "domainname" },
-
- { CTL_INT, KERN_PANIC, "panic" },
- { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
-
- { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
- { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
- { CTL_INT, KERN_PRINTK, "printk" },
-
- /* KERN_NAMETRANS not used */
- /* KERN_PPC_HTABRECLAIM not used */
- /* KERN_PPC_ZEROPAGED not used */
- { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
-
- { CTL_STR, KERN_MODPROBE, "modprobe" },
- { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
- { CTL_INT, KERN_ACCT, "acct" },
- /* KERN_PPC_L2CR "l2cr" no longer used */
-
- /* KERN_RTSIGNR not used */
- /* KERN_RTSIGMAX not used */
-
- { CTL_ULONG, KERN_SHMMAX, "shmmax" },
- { CTL_INT, KERN_MSGMAX, "msgmax" },
- { CTL_INT, KERN_MSGMNB, "msgmnb" },
- /* KERN_MSGPOOL not used*/
- { CTL_INT, KERN_SYSRQ, "sysrq" },
- { CTL_INT, KERN_MAX_THREADS, "threads-max" },
- { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
- { CTL_ULONG, KERN_SHMALL, "shmall" },
- { CTL_INT, KERN_MSGMNI, "msgmni" },
- { CTL_INT, KERN_SEM, "sem" },
- { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
- { CTL_INT, KERN_SHMMNI, "shmmni" },
-
- { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
- { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
-
- { CTL_STR, KERN_HOTPLUG, "hotplug", },
- { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
-
- { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
- { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
- /* KERN_TAINTED "tainted" no longer used */
- { CTL_INT, KERN_CADPID, "cad_pid" },
- { CTL_INT, KERN_PIDMAX, "pid_max" },
- { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
- { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
- { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
- { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
-
- { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
- { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
-
- { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
- { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
- { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
- /* KERN_HZ_TIMER "hz_timer" no longer used */
- { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
- { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
- { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
-
- { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
- /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
- { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
- { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
- { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
- { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
- { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
- { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" },
- {}
-};
-
-static const struct bin_table bin_vm_table[] = {
- { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
- { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
- { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
- { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
- /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
- /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
- /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
- { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
- /* VM_PAGEBUF unused */
- /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
- { CTL_INT, VM_SWAPPINESS, "swappiness" },
- { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
- { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
- { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
- { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
- { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
- { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
- { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
- { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
- /* VM_SWAP_TOKEN_TIMEOUT unused */
- { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
- { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
- { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
- { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
- { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
- { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
- { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
-
- {}
-};
-
-static const struct bin_table bin_net_core_table[] = {
- { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
- { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
- { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
- { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
- /* NET_CORE_DESTROY_DELAY unused */
- { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
- /* NET_CORE_FASTROUTE unused */
- { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
- { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
- { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
- /* NET_CORE_HOT_LIST_LENGTH unused */
- /* NET_CORE_DIVERT_VERSION unused */
- /* NET_CORE_NO_CONG_THRESH unused */
- /* NET_CORE_NO_CONG unused */
- /* NET_CORE_LO_CONG unused */
- /* NET_CORE_MOD_CONG unused */
- { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
- { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
- { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
- { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
- { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
- { CTL_INT, NET_CORE_WARNINGS, "warnings" },
- {},
-};
-
-static const struct bin_table bin_net_unix_table[] = {
- /* NET_UNIX_DESTROY_DELAY unused */
- /* NET_UNIX_DELETE_DELAY unused */
- { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_route_table[] = {
- { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
- /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
- /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
- { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
- { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
- { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
- { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
- { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
- /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
- { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
- { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
- { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
- { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
- { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
- { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
- { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
- { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
-
- { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
- { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
- { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
- { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
- { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
- { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
- { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
- { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
- { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
- { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
- { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
- { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
- { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
- { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
- { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
- { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
-
- { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
- { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
- { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
- { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_conf_table[] = {
- { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
- { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
- {}
-};
-
-static const struct bin_table bin_net_neigh_vars_table[] = {
- { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
- { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
- { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
- /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
- { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
- { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
- { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
- { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
- { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
- /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
- /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
- /* NET_NEIGH_LOCKTIME "locktime" no longer used */
- { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
- { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
- { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
- { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
- { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
- { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
- {}
-};
-
-static const struct bin_table bin_net_neigh_table[] = {
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
- { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_netfilter_table[] = {
- { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
-
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
-
- /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
- /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
- /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
- /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
-
- { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
-
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
-
- { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_table[] = {
- {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
-
- { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
- { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
- { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
- /* NET_IPV4_FIB_HASH unused */
- { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
-
- { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
- { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
- { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
- { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
- { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
- /* NET_IPV4_AUTOCONFIG unused */
- { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
- { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
- { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
- { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
- { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
- { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
- { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
- { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
- { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
- { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
- { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
- { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
- { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
- { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
- { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
- { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
- { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
- { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
- { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
- { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
- { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
- { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
- { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
- { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
- { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
- { CTL_INT, NET_TCP_FACK, "tcp_fack" },
- { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
- { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
- { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
- { CTL_INT, NET_TCP_MEM, "tcp_mem" },
- { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
- { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
- { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
- { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
- { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
- { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
- { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
- { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
- { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
- { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
- { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
- { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
- { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
- { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
- { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
- { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
- { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
- { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
- { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
- { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
- /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
- { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
- { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
-
- { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
- { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
- { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
- { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
- { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
- { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
-
- { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
- { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
- { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
-
- { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
- /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
-
- { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
-
- /* NET_TCP_DEFAULT_WIN_SCALE unused */
- /* NET_TCP_BIC_BETA unused */
- /* NET_IPV4_TCP_MAX_KA_PROBES unused */
- /* NET_IPV4_IP_MASQ_DEBUG unused */
- /* NET_TCP_SYN_TAILDROP unused */
- /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
- /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
- /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
- /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
- /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
- /* NET_IPV4_ALWAYS_DEFRAG unused */
- {}
-};
-
-static const struct bin_table bin_net_ipx_table[] = {
- { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
- /* NET_IPX_FORWARDING unused */
- {}
-};
-
-static const struct bin_table bin_net_atalk_table[] = {
- { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
- { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
- { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
- { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
- {},
-};
-
-static const struct bin_table bin_net_netrom_table[] = {
- { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
- { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
- { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
- { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
- { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
- { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
- { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
- { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
- { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
- { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
- { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
- { CTL_INT, NET_NETROM_RESET, "reset" },
- {}
-};
-
-static const struct bin_table bin_net_ax25_param_table[] = {
- { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
- { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
- { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
- { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
- { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
- { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
- { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
- { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
- { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
- { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
- { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
- { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
- { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
- { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_ax25_table[] = {
- { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
- {}
-};
-
-static const struct bin_table bin_net_rose_table[] = {
- { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
- { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
- { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
- { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
- { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
- { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
- { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
- { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
- { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
- { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_conf_var_table[] = {
- { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
- { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
- { CTL_INT, NET_IPV6_MTU, "mtu" },
- { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
- { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
- { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
- { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
- { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
- { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
- { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
- { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
- { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
- { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
- { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
- { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
- { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
- { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
- { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
- { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
- { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_conf_table[] = {
- { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
- { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_route_table[] = {
- /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
- { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
- { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
- { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
- { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
- { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
- { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
- { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
- { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
- { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_icmp_table[] = {
- { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_table[] = {
- { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
- { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
- { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
- { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
- { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
- { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
- { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
- { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
- { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
- { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
- { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
- {}
-};
-
-static const struct bin_table bin_net_x25_table[] = {
- { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
- { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
- { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
- { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
- { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
- { CTL_INT, NET_X25_FORWARD, "x25_forward" },
- {}
-};
-
-static const struct bin_table bin_net_tr_table[] = {
- { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
- {}
-};
-
-
-static const struct bin_table bin_net_decnet_conf_vars[] = {
- { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
- { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
- { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
- { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
- {}
-};
-
-static const struct bin_table bin_net_decnet_conf[] = {
- { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
- { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
- {}
-};
-
-static const struct bin_table bin_net_decnet_table[] = {
- { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
- { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
- { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
- { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
- { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
- { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
- { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
- { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
- { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
- { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
- { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
- { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
- { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
- { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
- {}
-};
-
-static const struct bin_table bin_net_sctp_table[] = {
- { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
- { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
- { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
- { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
- { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
- { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
- { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
- { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
- { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
- { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
- { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
- { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
- { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
- { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
- { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
- { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
- { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
- {}
-};
-
-static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
- { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
- { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
- { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
- { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
- {}
-};
-
-static const struct bin_table bin_net_llc_station_table[] = {
- { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_llc_llc2_table[] = {
- { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
- {}
-};
-
-static const struct bin_table bin_net_llc_table[] = {
- { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
- { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
- {}
-};
-
-static const struct bin_table bin_net_netfilter_table[] = {
- { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
- /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
- /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
- /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
- /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
- { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
- { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
- { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
- /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
- /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
- { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
- { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
-
- {}
-};
-
-static const struct bin_table bin_net_table[] = {
- { CTL_DIR, NET_CORE, "core", bin_net_core_table },
- /* NET_ETHER not used */
- /* NET_802 not used */
- { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
- { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
- { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
- { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
- { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
- { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
- /* NET_BRIDGE "bridge" no longer used */
- { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
- { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
- { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
- { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
- { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
- /* NET_ECONET not used */
- { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
- { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
- { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
- /* NET_DCCP "dccp" no longer used */
- /* NET_IRDA "irda" no longer used */
- { CTL_INT, 2089, "nf_conntrack_max" },
- {}
-};
-
-static const struct bin_table bin_fs_quota_table[] = {
- { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
- { CTL_INT, FS_DQ_DROPS, "drops" },
- { CTL_INT, FS_DQ_READS, "reads" },
- { CTL_INT, FS_DQ_WRITES, "writes" },
- { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
- { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
- { CTL_INT, FS_DQ_FREE, "free_dquots" },
- { CTL_INT, FS_DQ_SYNCS, "syncs" },
- { CTL_INT, FS_DQ_WARNINGS, "warnings" },
- {}
-};
-
-static const struct bin_table bin_fs_xfs_table[] = {
- { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
- { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
- { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
-
- { CTL_INT, XFS_ERRLEVEL, "error_level" },
- { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
- { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
- { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
- { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
- { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
- { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
- { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
- { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
- { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
- { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
- { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
- {}
-};
-
-static const struct bin_table bin_fs_ocfs2_nm_table[] = {
- { CTL_STR, 1, "hb_ctl_path" },
- {}
-};
-
-static const struct bin_table bin_fs_ocfs2_table[] = {
- { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
- {}
-};
-
-static const struct bin_table bin_inotify_table[] = {
- { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
- { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
- { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
- {}
-};
-
-static const struct bin_table bin_fs_table[] = {
- { CTL_INT, FS_NRINODE, "inode-nr" },
- { CTL_INT, FS_STATINODE, "inode-state" },
- /* FS_MAXINODE unused */
- /* FS_NRDQUOT unused */
- /* FS_MAXDQUOT unused */
- /* FS_NRFILE "file-nr" no longer used */
- { CTL_INT, FS_MAXFILE, "file-max" },
- { CTL_INT, FS_DENTRY, "dentry-state" },
- /* FS_NRSUPER unused */
- /* FS_MAXUPSER unused */
- { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
- { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
- { CTL_INT, FS_LEASES, "leases-enable" },
- { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
- { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
- { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
- { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
- { CTL_ULONG, FS_AIO_NR, "aio-nr" },
- { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
- { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
- { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
- { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
- {}
-};
-
-static const struct bin_table bin_ipmi_table[] = {
- { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
- {}
-};
-
-static const struct bin_table bin_mac_hid_files[] = {
- /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
- /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
- /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
- {}
-};
-
-static const struct bin_table bin_raid_table[] = {
- { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
- { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
- {}
-};
-
-static const struct bin_table bin_scsi_table[] = {
- { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
- {}
-};
-
-static const struct bin_table bin_dev_table[] = {
- /* DEV_CDROM "cdrom" no longer used */
- /* DEV_HWMON unused */
- /* DEV_PARPORT "parport" no longer used */
- { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
- { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
- { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
- { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
- {}
-};
-
-static const struct bin_table bin_bus_isa_table[] = {
- { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
- { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
- { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
- {}
-};
-
-static const struct bin_table bin_bus_table[] = {
- { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
- {}
-};
-
-
-static const struct bin_table bin_s390dbf_table[] = {
- { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
- { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
- {}
-};
-
-static const struct bin_table bin_sunrpc_table[] = {
- /* CTL_RPCDEBUG "rpc_debug" no longer used */
- /* CTL_NFSDEBUG "nfs_debug" no longer used */
- /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
- /* CTL_NLMDEBUG "nlm_debug" no longer used */
-
- { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
- { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
- { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
- { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
- {}
-};
-
-static const struct bin_table bin_pm_table[] = {
- /* frv specific */
- /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
- { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
- { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
- { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
- {}
-};
-
-static const struct bin_table bin_root_table[] = {
- { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
- { CTL_DIR, CTL_VM, "vm", bin_vm_table },
- { CTL_DIR, CTL_NET, "net", bin_net_table },
- /* CTL_PROC not used */
- { CTL_DIR, CTL_FS, "fs", bin_fs_table },
- /* CTL_DEBUG "debug" no longer used */
- { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
- { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
- { CTL_DIR, CTL_ABI, "abi" },
- /* CTL_CPU not used */
- /* CTL_ARLAN "arlan" no longer used */
- { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
- { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
- { CTL_DIR, CTL_PM, "pm", bin_pm_table },
- {}
-};
-
-static ssize_t bin_dir(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOTDIR;
-}
-
-
-static ssize_t bin_string(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- if (oldval && oldlen) {
- char __user *lastp;
- loff_t pos = 0;
- int ch;
-
- result = vfs_read(file, oldval, oldlen, &pos);
- if (result < 0)
- goto out;
-
- copied = result;
- lastp = oldval + copied - 1;
-
- result = -EFAULT;
- if (get_user(ch, lastp))
- goto out;
-
- /* Trim off the trailing newline */
- if (ch == '\n') {
- result = -EFAULT;
- if (put_user('\0', lastp))
- goto out;
- copied -= 1;
- }
- }
-
- if (newval && newlen) {
- loff_t pos = 0;
-
- result = vfs_write(file, newval, newlen, &pos);
- if (result < 0)
- goto out;
- }
-
- result = copied;
-out:
- return result;
-}
-
-static ssize_t bin_intvec(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t copied = 0;
- char *buffer;
- ssize_t result;
-
- result = -ENOMEM;
- buffer = kmalloc(BUFSZ, GFP_KERNEL);
- if (!buffer)
- goto out;
-
- if (oldval && oldlen) {
- unsigned __user *vec = oldval;
- size_t length = oldlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- result = kernel_read(file, buffer, BUFSZ - 1, &pos);
- if (result < 0)
- goto out_kfree;
-
- str = buffer;
- end = str + result;
- *end++ = '\0';
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- value = simple_strtoul(str, &str, 10);
- while (isspace(*str))
- str++;
-
- result = -EFAULT;
- if (put_user(value, vec + i))
- goto out_kfree;
-
- copied += sizeof(*vec);
- if (!isdigit(*str))
- break;
- }
- }
-
- if (newval && newlen) {
- unsigned __user *vec = newval;
- size_t length = newlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- str = buffer;
- end = str + BUFSZ;
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- result = -EFAULT;
- if (get_user(value, vec + i))
- goto out_kfree;
-
- str += scnprintf(str, end - str, "%lu\t", value);
- }
-
- result = kernel_write(file, buffer, str - buffer, &pos);
- if (result < 0)
- goto out_kfree;
- }
- result = copied;
-out_kfree:
- kfree(buffer);
-out:
- return result;
-}
-
-static ssize_t bin_ulongvec(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t copied = 0;
- char *buffer;
- ssize_t result;
-
- result = -ENOMEM;
- buffer = kmalloc(BUFSZ, GFP_KERNEL);
- if (!buffer)
- goto out;
-
- if (oldval && oldlen) {
- unsigned long __user *vec = oldval;
- size_t length = oldlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- result = kernel_read(file, buffer, BUFSZ - 1, &pos);
- if (result < 0)
- goto out_kfree;
-
- str = buffer;
- end = str + result;
- *end++ = '\0';
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- value = simple_strtoul(str, &str, 10);
- while (isspace(*str))
- str++;
-
- result = -EFAULT;
- if (put_user(value, vec + i))
- goto out_kfree;
-
- copied += sizeof(*vec);
- if (!isdigit(*str))
- break;
- }
- }
-
- if (newval && newlen) {
- unsigned long __user *vec = newval;
- size_t length = newlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- str = buffer;
- end = str + BUFSZ;
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- result = -EFAULT;
- if (get_user(value, vec + i))
- goto out_kfree;
-
- str += scnprintf(str, end - str, "%lu\t", value);
- }
-
- result = kernel_write(file, buffer, str - buffer, &pos);
- if (result < 0)
- goto out_kfree;
- }
- result = copied;
-out_kfree:
- kfree(buffer);
-out:
- return result;
-}
-
-static ssize_t bin_uuid(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- /* Only supports reads */
- if (oldval && oldlen) {
- char buf[UUID_STRING_LEN + 1];
- uuid_t uuid;
- loff_t pos = 0;
-
- result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
- if (result < 0)
- goto out;
-
- buf[result] = '\0';
-
- result = -EIO;
- if (uuid_parse(buf, &uuid))
- goto out;
-
- if (oldlen > 16)
- oldlen = 16;
-
- result = -EFAULT;
- if (copy_to_user(oldval, &uuid, oldlen))
- goto out;
-
- copied = oldlen;
- }
- result = copied;
-out:
- return result;
-}
-
-static ssize_t bin_dn_node_address(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- if (oldval && oldlen) {
- char buf[15], *nodep;
- unsigned long area, node;
- __le16 dnaddr;
- loff_t pos = 0;
-
- result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
- if (result < 0)
- goto out;
-
- buf[result] = '\0';
-
- /* Convert the decnet address to binary */
- result = -EIO;
- nodep = strchr(buf, '.');
- if (!nodep)
- goto out;
- ++nodep;
-
- area = simple_strtoul(buf, NULL, 10);
- node = simple_strtoul(nodep, NULL, 10);
-
- result = -EIO;
- if ((area > 63)||(node > 1023))
- goto out;
-
- dnaddr = cpu_to_le16((area << 10) | node);
-
- result = -EFAULT;
- if (put_user(dnaddr, (__le16 __user *)oldval))
- goto out;
-
- copied = sizeof(dnaddr);
- }
-
- if (newval && newlen) {
- __le16 dnaddr;
- char buf[15];
- int len;
- loff_t pos = 0;
-
- result = -EINVAL;
- if (newlen != sizeof(dnaddr))
- goto out;
-
- result = -EFAULT;
- if (get_user(dnaddr, (__le16 __user *)newval))
- goto out;
-
- len = scnprintf(buf, sizeof(buf), "%hu.%hu",
- le16_to_cpu(dnaddr) >> 10,
- le16_to_cpu(dnaddr) & 0x3ff);
-
- result = kernel_write(file, buf, len, &pos);
- if (result < 0)
- goto out;
- }
-
- result = copied;
-out:
- return result;
-}
-
-static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
-{
- const struct bin_table *table = &bin_root_table[0];
- int ctl_name;
-
- /* The binary sysctl tables have a small maximum depth so
- * there is no danger of overflowing our path as it PATH_MAX
- * bytes long.
- */
- memcpy(path, "sys/", 4);
- path += 4;
-
-repeat:
- if (!nlen)
- return ERR_PTR(-ENOTDIR);
- ctl_name = *name;
- name++;
- nlen--;
- for ( ; table->convert; table++) {
- int len = 0;
-
- /*
- * For a wild card entry map from ifindex to network
- * device name.
- */
- if (!table->ctl_name) {
-#ifdef CONFIG_NET
- struct net *net = current->nsproxy->net_ns;
- struct net_device *dev;
- dev = dev_get_by_index(net, ctl_name);
- if (dev) {
- len = strlen(dev->name);
- memcpy(path, dev->name, len);
- dev_put(dev);
- }
-#endif
- /* Use the well known sysctl number to proc name mapping */
- } else if (ctl_name == table->ctl_name) {
- len = strlen(table->procname);
- memcpy(path, table->procname, len);
- }
- if (len) {
- path += len;
- if (table->child) {
- *path++ = '/';
- table = table->child;
- goto repeat;
- }
- *path = '\0';
- return table;
- }
- }
- return ERR_PTR(-ENOTDIR);
-}
-
-static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
-{
- char *tmp, *result;
-
- result = ERR_PTR(-ENOMEM);
- tmp = __getname();
- if (tmp) {
- const struct bin_table *table = get_sysctl(name, nlen, tmp);
- result = tmp;
- *tablep = table;
- if (IS_ERR(table)) {
- __putname(tmp);
- result = ERR_CAST(table);
- }
- }
- return result;
-}
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- const struct bin_table *table = NULL;
- struct vfsmount *mnt;
- struct file *file;
- ssize_t result;
- char *pathname;
- int flags;
-
- pathname = sysctl_getname(name, nlen, &table);
- result = PTR_ERR(pathname);
- if (IS_ERR(pathname))
- goto out;
-
- /* How should the sysctl be accessed? */
- if (oldval && oldlen && newval && newlen) {
- flags = O_RDWR;
- } else if (newval && newlen) {
- flags = O_WRONLY;
- } else if (oldval && oldlen) {
- flags = O_RDONLY;
- } else {
- result = 0;
- goto out_putname;
- }
-
- mnt = task_active_pid_ns(current)->proc_mnt;
- file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
- result = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_putname;
-
- result = table->convert(file, oldval, oldlen, newval, newlen);
-
- fput(file);
-out_putname:
- __putname(pathname);
-out:
- return result;
-}
-
-
-#else /* CONFIG_SYSCTL_SYSCALL */
-
static ssize_t binary_sysctl(const int *name, int nlen,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
return -ENOSYS;
}
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
-
static void deprecated_sysctl_warning(const int *name, int nlen)
{
int i;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13a0f2e6ebc2..e2ac0e37c4ae 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with smp_store_release() above and order the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1867044800bb..c8f00168afe8 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 57518efc3810..2ffb466af77e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/module.h>
+#include <linux/time_namespace.h>
#include "posix-timers.h"
@@ -36,13 +37,15 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @gettime: Function to read the time correlating to the base
+ * @get_ktime: Function to read the time correlating to the base
+ * @get_timespec: Function to read the namespace time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- ktime_t (*gettime)(void);
+ ktime_t (*get_ktime)(void);
+ void (*get_timespec)(struct timespec64 *tp);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -55,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
#ifdef CONFIG_RTC_CLASS
-static struct wakeup_source *ws;
-
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -66,8 +67,6 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
- * If one has not already been chosen, it checks to see if a
- * functional rtc device is available.
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
@@ -87,7 +86,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
- struct wakeup_source *__ws;
+ struct platform_device *pdev;
+ int ret = 0;
if (rtcdev)
return -EBUSY;
@@ -97,26 +97,31 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
- __ws = wakeup_source_register("alarmtimer");
+ pdev = platform_device_register_data(dev, "alarmtimer",
+ PLATFORM_DEVID_AUTO, NULL, 0);
+ if (!IS_ERR(pdev))
+ device_init_wakeup(&pdev->dev, true);
spin_lock_irqsave(&rtcdev_lock, flags);
- if (!rtcdev) {
+ if (!IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
- spin_unlock_irqrestore(&rtcdev_lock, flags);
- return -1;
+ ret = -1;
+ goto unlock;
}
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
- ws = __ws;
- __ws = NULL;
+ pdev = NULL;
+ } else {
+ ret = -1;
}
+unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
- wakeup_source_unregister(__ws);
+ platform_device_unregister(pdev);
- return 0;
+ return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
@@ -138,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-struct rtc_device *alarmtimer_get_rtcdev(void)
-{
- return NULL;
-}
-#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
@@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
spin_unlock_irqrestore(&base->lock, flags);
if (alarm->function)
- restart = alarm->function(alarm, base->gettime());
+ restart = alarm->function(alarm, base->get_ktime());
spin_lock_irqsave(&base->lock, flags);
if (restart != ALARMTIMER_NORESTART) {
@@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_fired(alarm, base->gettime());
+ trace_alarmtimer_fired(alarm, base->get_ktime());
return ret;
}
@@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
ktime_t alarm_expires_remaining(const struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return ktime_sub(alarm->node.expires, base->gettime());
+ return ktime_sub(alarm->node.expires, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_expires_remaining);
@@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev)
spin_unlock_irqrestore(&base->lock, flags);
if (!next)
continue;
- delta = ktime_sub(next->expires, base->gettime());
+ delta = ktime_sub(next->expires, base->get_ktime());
if (!min || (delta < min)) {
expires = next->expires;
min = delta;
@@ -281,7 +281,7 @@ static int alarmtimer_suspend(struct device *dev)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
- __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
return -EBUSY;
}
@@ -296,7 +296,7 @@ static int alarmtimer_suspend(struct device *dev)
/* Set alarm, if in the past reject suspend briefly to handle */
ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
- __pm_wakeup_event(ws, MSEC_PER_SEC);
+ pm_wakeup_event(dev, MSEC_PER_SEC);
return ret;
}
@@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start)
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_start(alarm, base->gettime());
+ trace_alarmtimer_start(alarm, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add_safe(start, base->gettime());
+ start = ktime_add_safe(start, base->get_ktime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_cancel(alarm, base->gettime());
+ trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
@@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm)
int ret = alarm_try_to_cancel(alarm);
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_cancel_wait_running(&alarm->timer);
}
}
EXPORT_SYMBOL_GPL(alarm_cancel);
@@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return alarm_forward(alarm, base->gettime(), interval);
+ return alarm_forward(alarm, base->get_ktime(), interval);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
return;
}
- delta = ktime_sub(absexp, base->gettime());
+ delta = ktime_sub(absexp, base->get_ktime());
spin_lock_irqsave(&freezer_delta_lock, flags);
if (!freezer_delta || (delta < freezer_delta)) {
@@ -606,6 +606,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr)
}
/**
+ * alarm_timer_wait_running - Posix timer callback to wait for a timer
+ * @timr: Pointer to the posixtimer data struct
+ *
+ * Called from the core code when timer cancel detected that the callback
+ * is running. @timr is unlocked and rcu read lock is held to prevent it
+ * from being freed.
+ */
+static void alarm_timer_wait_running(struct k_itimer *timr)
+{
+ hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer);
+}
+
+/**
* alarm_timer_arm - Posix timer callback to arm a timer
* @timr: Pointer to the posixtimer data struct
* @expires: The new expiry time
@@ -619,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
struct alarm_base *base = &alarm_bases[alarm->type];
if (!absolute)
- expires = ktime_add_safe(expires, base->gettime());
+ expires = ktime_add_safe(expires, base->get_ktime());
if (sigev_none)
alarm->node.expires = expires;
else
@@ -644,24 +657,41 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
}
/**
- * alarm_clock_get - posix clock_get interface
+ * alarm_clock_get_timespec - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
- * Provides the underlying alarm base time.
+ * Provides the underlying alarm base time in a tasks time namespace.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
+static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec64(base->gettime());
+ base->get_timespec(tp);
+
return 0;
}
/**
+ * alarm_clock_get_ktime - posix clock_get_ktime interface
+ * @which_clock: clockid
+ *
+ * Provides the underlying alarm base time in the root namespace.
+ */
+static ktime_t alarm_clock_get_ktime(clockid_t which_clock)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ return base->get_ktime();
+}
+
+/**
* alarm_timer_create - posix timer_create interface
* @new_timer: k_itimer pointer to manage
*
@@ -672,7 +702,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
enum alarmtimer_type type;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -734,7 +764,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
struct timespec64 rmt;
ktime_t rem;
- rem = ktime_sub(absexp, alarm_bases[type].gettime());
+ rem = ktime_sub(absexp, alarm_bases[type].get_ktime());
if (rem <= 0)
return 0;
@@ -790,7 +820,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
int ret = 0;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (flags & ~TIMER_ABSTIME)
return -EINVAL;
@@ -803,9 +833,11 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
- ktime_t now = alarm_bases[type].gettime();
+ ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
+ } else {
+ exp = timens_ktime_to_host(which_clock, exp);
}
ret = alarmtimer_do_nsleep(&alarm, exp, type);
@@ -824,7 +856,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
- .clock_get = alarm_clock_get,
+ .clock_get_ktime = alarm_clock_get_ktime,
+ .clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
@@ -834,6 +867,7 @@ const struct k_clock alarm_clock = {
.timer_forward = alarm_timer_forward,
.timer_remaining = alarm_timer_remaining,
.timer_try_to_cancel = alarm_timer_try_to_cancel,
+ .timer_wait_running = alarm_timer_wait_running,
.nsleep = alarm_timer_nsleep,
};
#endif /* CONFIG_POSIX_TIMERS */
@@ -852,6 +886,12 @@ static struct platform_driver alarmtimer_driver = {
}
};
+static void get_boottime_timespec(struct timespec64 *tp)
+{
+ ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
+}
+
/**
* alarmtimer_init - Initialize alarm timer code
*
@@ -860,17 +900,18 @@ static struct platform_driver alarmtimer_driver = {
*/
static int __init alarmtimer_init(void)
{
- struct platform_device *pdev;
- int error = 0;
+ int error;
int i;
alarmtimer_rtc_timer_init();
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
- alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64,
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
- alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
@@ -884,15 +925,7 @@ static int __init alarmtimer_init(void)
if (error)
goto out_if;
- pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- error = PTR_ERR(pdev);
- goto out_drv;
- }
return 0;
-
-out_drv:
- platform_driver_unregister(&alarmtimer_driver);
out_if:
alarmtimer_rtc_interface_remove();
return error;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fff5f64981c6..428beb69426a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused)
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+
+ /*
+ * Arm timer if not already pending: could race with concurrent
+ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5ee77f1a8a92..3a609e7344f3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -159,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
struct hrtimer_clock_base *base;
for (;;) {
- base = timer->base;
+ base = READ_ONCE(timer->base);
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
@@ -239,7 +244,7 @@ again:
return base;
/* See the comment in lock_hrtimer_base() */
- timer->base = &migration_base;
+ WRITE_ONCE(timer->base, &migration_base);
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
@@ -248,10 +253,10 @@ again:
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
- timer->base = base;
+ WRITE_ONCE(timer->base, base);
goto again;
}
- timer->base = new_base;
+ WRITE_ONCE(timer->base, new_base);
} else {
if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
@@ -264,6 +269,11 @@ again:
#else /* CONFIG_SMP */
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
@@ -427,6 +437,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode);
+
+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
+{
+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
+ __hrtimer_init_sleeper(sl, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
+
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -945,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- timer->state = HRTIMER_STATE_ENQUEUED;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
return timerqueue_add(&base->active, &timer->node);
}
@@ -967,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
u8 state = timer->state;
- timer->state = newstate;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, newstate);
if (!(state & HRTIMER_STATE_ENQUEUED))
return;
@@ -992,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
- if (hrtimer_is_queued(timer)) {
- u8 state = timer->state;
+ u8 state = timer->state;
+
+ if (state & HRTIMER_STATE_ENQUEUED) {
int reprogram;
/*
@@ -1096,9 +1120,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
- * match.
+ * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+ * expiry mode because unmarked timers are moved to softirq expiry.
*/
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ else
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
base = lock_hrtimer_base(timer, &flags);
@@ -1147,6 +1175,93 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
+#ifdef CONFIG_PREEMPT_RT
+static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
+{
+ spin_lock_init(&base->softirq_expiry_lock);
+}
+
+static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
+{
+ spin_lock(&base->softirq_expiry_lock);
+}
+
+static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
+{
+ spin_unlock(&base->softirq_expiry_lock);
+}
+
+/*
+ * The counterpart to hrtimer_cancel_wait_running().
+ *
+ * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
+ * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * allows the waiter to acquire the lock and make progress.
+ */
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
+ unsigned long flags)
+{
+ if (atomic_read(&cpu_base->timer_waiters)) {
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ spin_unlock(&cpu_base->softirq_expiry_lock);
+ spin_lock(&cpu_base->softirq_expiry_lock);
+ raw_spin_lock_irq(&cpu_base->lock);
+ }
+}
+
+/*
+ * This function is called on PREEMPT_RT kernels when the fast path
+ * deletion of a timer failed because the timer callback function was
+ * running.
+ *
+ * This prevents priority inversion: if the soft irq thread is preempted
+ * in the middle of a timer callback, then calling del_timer_sync() can
+ * lead to two issues:
+ *
+ * - If the caller is on a remote CPU then it has to spin wait for the timer
+ * handler to complete. This can result in unbound priority inversion.
+ *
+ * - If the caller originates from the task which preempted the timer
+ * handler on the same CPU, then spin waiting for the timer handler to
+ * complete is never going to end.
+ */
+void hrtimer_cancel_wait_running(const struct hrtimer *timer)
+{
+ /* Lockless read. Prevent the compiler from reloading it below */
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
+ /*
+ * Just relax if the timer expires in hard interrupt context or if
+ * it is currently on the migration base.
+ */
+ if (!timer->is_soft || is_migration_base(base)) {
+ cpu_relax();
+ return;
+ }
+
+ /*
+ * Mark the base as contended and grab the expiry lock, which is
+ * held by the softirq across the timer callback. Drop the lock
+ * immediately so the softirq can expire the next timer. In theory
+ * the timer could already be running again, but that's more than
+ * unlikely and just causes another wait loop.
+ */
+ atomic_inc(&base->cpu_base->timer_waiters);
+ spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
+ atomic_dec(&base->cpu_base->timer_waiters);
+ spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
+}
+#else
+static inline void
+hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void
+hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void
+hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
+ unsigned long flags) { }
+#endif
+
/**
* hrtimer_cancel - cancel a timer and wait for the handler to finish.
* @timer: the timer to be cancelled
@@ -1157,13 +1272,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
*/
int hrtimer_cancel(struct hrtimer *timer)
{
- for (;;) {
- int ret = hrtimer_try_to_cancel(timer);
+ int ret;
- if (ret >= 0)
- return ret;
- cpu_relax();
- }
+ do {
+ ret = hrtimer_try_to_cancel(timer);
+
+ if (ret < 0)
+ hrtimer_cancel_wait_running(timer);
+ } while (ret < 0);
+ return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1260,8 +1377,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
- int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
struct hrtimer_cpu_base *cpu_base;
+ int base;
+
+ /*
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * marked for hard interrupt expiry mode are moved into soft
+ * interrupt context for latency reasons and because the callbacks
+ * can invoke functions which might sleep on RT, e.g. spin_lock().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
+ softtimer = true;
memset(timer, 0, sizeof(struct hrtimer));
@@ -1275,8 +1401,10 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
+ timer->is_hard = !softtimer;
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1349,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
- unsigned long flags)
+ unsigned long flags) __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1449,6 +1577,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
break;
__run_hrtimer(cpu_base, base, timer, &basenow, flags);
+ if (active_mask == HRTIMER_ACTIVE_SOFT)
+ hrtimer_sync_wait_running(cpu_base, flags);
}
}
}
@@ -1459,6 +1589,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
unsigned long flags;
ktime_t now;
+ hrtimer_cpu_base_lock_expiry(cpu_base);
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
@@ -1468,6 +1599,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ hrtimer_cpu_base_unlock_expiry(cpu_base);
}
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1639,10 +1771,75 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+/**
+ * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
+ * @sl: sleeper to be started
+ * @mode: timer mode abs/rel
+ *
+ * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
+ * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
+ */
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
+ enum hrtimer_mode mode)
+{
+ /*
+ * Make the enqueue delivery mode check work on RT. If the sleeper
+ * was initialized for hard interrupt delivery, force the mode bit.
+ * This is a special case for hrtimer_sleepers because
+ * hrtimer_init_sleeper() determines the delivery mode on RT so the
+ * fiddling with this decision is avoided at the call sites.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
+ mode |= HRTIMER_MODE_HARD;
+
+ hrtimer_start_expires(&sl->timer, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
+
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
{
+ /*
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * marked for hard interrupt expiry mode are moved into soft
+ * interrupt context either for latency reasons or because the
+ * hrtimer callback takes regular spinlocks or invokes other
+ * functions which are not suitable for hard interrupt context on
+ * PREEMPT_RT.
+ *
+ * The hrtimer_sleeper callback is RT compatible in hard interrupt
+ * context, but there is a latency concern: Untrusted userspace can
+ * spawn many threads which arm timers for the same expiry time on
+ * the same CPU. That causes a latency spike due to the wakeup of
+ * a gazillion threads.
+ *
+ * OTOH, priviledged real-time user space applications rely on the
+ * low latency of hard interrupt wakeups. If the current task is in
+ * a real-time scheduling class, mark the mode for hard interrupt
+ * expiry.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+ mode |= HRTIMER_MODE_HARD;
+ }
+
+ __hrtimer_init(&sl->timer, clock_id, mode);
sl->timer.function = hrtimer_wakeup;
- sl->task = task;
+ sl->task = current;
+}
+
+/**
+ * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * @sl: sleeper to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_init(&sl->timer, clock_id, mode);
+ __hrtimer_init_sleeper(sl, clock_id, mode);
+
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
@@ -1669,11 +1866,9 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
{
struct restart_block *restart;
- hrtimer_init_sleeper(t, current);
-
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start_expires(&t->timer, mode);
+ hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
freezable_schedule();
@@ -1707,17 +1902,16 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct hrtimer_sleeper t;
int ret;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
- HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
+ HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
-
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
}
-long hrtimer_nanosleep(const struct timespec64 *rqtp,
- const enum hrtimer_mode mode, const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
+ const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
@@ -1728,8 +1922,8 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
if (dl_task(current) || rt_task(current))
slack = 0;
- hrtimer_init_on_stack(&t.timer, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
+ hrtimer_init_sleeper_on_stack(&t, clockid, mode);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -1749,7 +1943,7 @@ out:
return ret;
}
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
struct __kernel_timespec __user *, rmtp)
@@ -1764,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
@@ -1784,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
@@ -1809,6 +2005,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -1927,12 +2124,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock_id, mode);
+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
-
- hrtimer_init_sleeper(&t, current);
-
- hrtimer_start_expires(&t.timer, mode);
+ hrtimer_sleeper_start_expires(&t, mode);
if (likely(t.task))
schedule();
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 02068b2d5862..ca4e6d57d68b 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -26,7 +26,7 @@
* Returns the delta between the expiry time and now, which can be
* less than zero or 1usec for an pending expired timer
*/
-static struct timeval itimer_get_remtime(struct hrtimer *timer)
+static struct timespec64 itimer_get_remtime(struct hrtimer *timer)
{
ktime_t rem = __hrtimer_get_remaining(timer, true);
@@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
} else
rem = 0;
- return ktime_to_timeval(rem);
+ return ktime_to_timespec64(rem);
}
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
- struct itimerval *const value)
+ struct itimerspec64 *const value)
{
u64 val, interval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
@@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
val = it->expires;
interval = it->incr;
if (val) {
- struct task_cputime cputime;
- u64 t;
+ u64 t, samples[CPUCLOCK_MAX];
- thread_group_cputimer(tsk, &cputime);
- if (clock_id == CPUCLOCK_PROF)
- t = cputime.utime + cputime.stime;
- else
- /* CPUCLOCK_VIRT */
- t = cputime.utime;
+ thread_group_sample_cputime(tsk, samples);
+ t = samples[clock_id];
if (val < t)
/* about to fire */
@@ -74,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
spin_unlock_irq(&tsk->sighand->siglock);
- value->it_value = ns_to_timeval(val);
- value->it_interval = ns_to_timeval(interval);
+ value->it_value = ns_to_timespec64(val);
+ value->it_interval = ns_to_timespec64(interval);
}
-int do_getitimer(int which, struct itimerval *value)
+static int do_getitimer(int which, struct itimerspec64 *value)
{
struct task_struct *tsk = current;
@@ -87,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
spin_lock_irq(&tsk->sighand->siglock);
value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
value->it_interval =
- ktime_to_timeval(tsk->signal->it_real_incr);
+ ktime_to_timespec64(tsk->signal->it_real_incr);
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
@@ -102,34 +97,59 @@ int do_getitimer(int which, struct itimerval *value)
return 0;
}
-SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
+static int put_itimerval(struct __kernel_old_itimerval __user *o,
+ const struct itimerspec64 *i)
{
- int error = -EFAULT;
- struct itimerval get_buffer;
+ struct __kernel_old_itimerval v;
- if (value) {
- error = do_getitimer(which, &get_buffer);
- if (!error &&
- copy_to_user(value, &get_buffer, sizeof(get_buffer)))
- error = -EFAULT;
- }
+ v.it_interval.tv_sec = i->it_interval.tv_sec;
+ v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC;
+ v.it_value.tv_sec = i->it_value.tv_sec;
+ v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC;
+ return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0;
+}
+
+
+SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value)
+{
+ struct itimerspec64 get_buffer;
+ int error = do_getitimer(which, &get_buffer);
+
+ if (!error && put_itimerval(value, &get_buffer))
+ error = -EFAULT;
return error;
}
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
+struct old_itimerval32 {
+ struct old_timeval32 it_interval;
+ struct old_timeval32 it_value;
+};
+
+static int put_old_itimerval32(struct old_itimerval32 __user *o,
+ const struct itimerspec64 *i)
+{
+ struct old_itimerval32 v32;
+
+ v32.it_interval.tv_sec = i->it_interval.tv_sec;
+ v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC;
+ v32.it_value.tv_sec = i->it_value.tv_sec;
+ v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC;
+ return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? -EFAULT : 0;
+}
+
COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
- struct compat_itimerval __user *, it)
+ struct old_itimerval32 __user *, value)
{
- struct itimerval kit;
- int error = do_getitimer(which, &kit);
+ struct itimerspec64 get_buffer;
+ int error = do_getitimer(which, &get_buffer);
- if (!error && put_compat_itimerval(it, &kit))
+ if (!error && put_old_itimerval32(value, &get_buffer))
error = -EFAULT;
return error;
}
#endif
-
/*
* The timer is automagically restarted, when interval != 0
*/
@@ -146,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
}
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
- const struct itimerval *const value,
- struct itimerval *const ovalue)
+ const struct itimerspec64 *const value,
+ struct itimerspec64 *const ovalue)
{
u64 oval, nval, ointerval, ninterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
@@ -156,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
* Use the to_ktime conversion because that clamps the maximum
* value to KTIME_MAX and avoid multiplication overflows.
*/
- nval = ktime_to_ns(timeval_to_ktime(value->it_value));
- ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval));
+ nval = timespec64_to_ns(&value->it_value);
+ ninterval = timespec64_to_ns(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
@@ -176,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
- ovalue->it_value = ns_to_timeval(oval);
- ovalue->it_interval = ns_to_timeval(ointerval);
+ ovalue->it_value = ns_to_timespec64(oval);
+ ovalue->it_interval = ns_to_timespec64(ointerval);
}
}
@@ -187,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
-int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+static int do_setitimer(int which, struct itimerspec64 *value,
+ struct itimerspec64 *ovalue)
{
struct task_struct *tsk = current;
struct hrtimer *timer;
ktime_t expires;
- /*
- * Validate the timevals in value.
- */
- if (!timeval_valid(&value->it_value) ||
- !timeval_valid(&value->it_interval))
- return -EINVAL;
-
switch (which) {
case ITIMER_REAL:
again:
@@ -208,17 +222,18 @@ again:
if (ovalue) {
ovalue->it_value = itimer_get_remtime(timer);
ovalue->it_interval
- = ktime_to_timeval(tsk->signal->it_real_incr);
+ = ktime_to_timespec64(tsk->signal->it_real_incr);
}
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
+ hrtimer_cancel_wait_running(timer);
goto again;
}
- expires = timeval_to_ktime(value->it_value);
+ expires = timespec64_to_ktime(value->it_value);
if (expires != 0) {
tsk->signal->it_real_incr =
- timeval_to_ktime(value->it_interval);
+ timespec64_to_ktime(value->it_interval);
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
} else
tsk->signal->it_real_incr = 0;
@@ -238,6 +253,17 @@ again:
return 0;
}
+#ifdef CONFIG_SECURITY_SELINUX
+void clear_itimer(void)
+{
+ struct itimerspec64 v = {};
+ int i;
+
+ for (i = 0; i < 3; i++)
+ do_setitimer(i, &v, NULL);
+}
+#endif
+
#ifdef __ARCH_WANT_SYS_ALARM
/**
@@ -254,15 +280,15 @@ again:
*/
static unsigned int alarm_setitimer(unsigned int seconds)
{
- struct itimerval it_new, it_old;
+ struct itimerspec64 it_new, it_old;
#if BITS_PER_LONG < 64
if (seconds > INT_MAX)
seconds = INT_MAX;
#endif
it_new.it_value.tv_sec = seconds;
- it_new.it_value.tv_usec = 0;
- it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+ it_new.it_value.tv_nsec = 0;
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0;
do_setitimer(ITIMER_REAL, &it_new, &it_old);
@@ -270,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds)
* We can't return 0 if we have an alarm pending ... And we'd
* better return too much than too little anyway
*/
- if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
- it_old.it_value.tv_usec >= 500000)
+ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) ||
+ it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2))
it_old.it_value.tv_sec++;
return it_old.it_value.tv_sec;
@@ -288,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
-SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
- struct itimerval __user *, ovalue)
+static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i)
{
- struct itimerval set_buffer, get_buffer;
+ struct __kernel_old_itimerval v;
+
+ if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval)))
+ return -EFAULT;
+
+ /* Validate the timevals in value. */
+ if (!timeval_valid(&v.it_value) ||
+ !timeval_valid(&v.it_interval))
+ return -EINVAL;
+
+ o->it_interval.tv_sec = v.it_interval.tv_sec;
+ o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC;
+ o->it_value.tv_sec = v.it_value.tv_sec;
+ o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC;
+ return 0;
+}
+
+SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value,
+ struct __kernel_old_itimerval __user *, ovalue)
+{
+ struct itimerspec64 set_buffer, get_buffer;
int error;
if (value) {
- if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
- return -EFAULT;
+ error = get_itimerval(&set_buffer, value);
+ if (error)
+ return error;
} else {
memset(&set_buffer, 0, sizeof(set_buffer));
printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
@@ -308,30 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
if (error || !ovalue)
return error;
- if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ if (put_itimerval(ovalue, &get_buffer))
+ return -EFAULT;
+ return 0;
+}
+
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
+static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i)
+{
+ struct old_itimerval32 v32;
+
+ if (copy_from_user(&v32, i, sizeof(struct old_itimerval32)))
return -EFAULT;
+
+ /* Validate the timevals in value. */
+ if (!timeval_valid(&v32.it_value) ||
+ !timeval_valid(&v32.it_interval))
+ return -EINVAL;
+
+ o->it_interval.tv_sec = v32.it_interval.tv_sec;
+ o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC;
+ o->it_value.tv_sec = v32.it_value.tv_sec;
+ o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC;
return 0;
}
-#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
- struct compat_itimerval __user *, in,
- struct compat_itimerval __user *, out)
+ struct old_itimerval32 __user *, value,
+ struct old_itimerval32 __user *, ovalue)
{
- struct itimerval kin, kout;
+ struct itimerspec64 set_buffer, get_buffer;
int error;
- if (in) {
- if (get_compat_itimerval(&kin, in))
- return -EFAULT;
+ if (value) {
+ error = get_old_itimerval32(&set_buffer, value);
+ if (error)
+ return error;
} else {
- memset(&kin, 0, sizeof(kin));
+ memset(&set_buffer, 0, sizeof(set_buffer));
+ printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+ " Misfeature support will be removed\n",
+ current->comm);
}
- error = do_setitimer(which, &kin, out ? &kout : NULL);
- if (error || !out)
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+ if (error || !ovalue)
return error;
- if (put_compat_itimerval(out, &kout))
+ if (put_old_itimerval32(ovalue, &get_buffer))
return -EFAULT;
return 0;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
new file mode 100644
index 000000000000..12858507d75a
--- /dev/null
+++ b/kernel/time/namespace.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/time_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/seq_file.h>
+#include <linux/proc_ns.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+
+#include <vdso/datapage.h>
+
+ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
+ struct timens_offsets *ns_offsets)
+{
+ ktime_t offset;
+
+ switch (clockid) {
+ case CLOCK_MONOTONIC:
+ offset = timespec64_to_ktime(ns_offsets->monotonic);
+ break;
+ case CLOCK_BOOTTIME:
+ case CLOCK_BOOTTIME_ALARM:
+ offset = timespec64_to_ktime(ns_offsets->boottime);
+ break;
+ default:
+ return tim;
+ }
+
+ /*
+ * Check that @tim value is in [offset, KTIME_MAX + offset]
+ * and subtract offset.
+ */
+ if (tim < offset) {
+ /*
+ * User can specify @tim *absolute* value - if it's lesser than
+ * the time namespace's offset - it's already expired.
+ */
+ tim = 0;
+ } else {
+ tim = ktime_sub(tim, offset);
+ if (unlikely(tim > KTIME_MAX))
+ tim = KTIME_MAX;
+ }
+
+ return tim;
+}
+
+static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
+}
+
+static void dec_time_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
+}
+
+/**
+ * clone_time_ns - Clone a time namespace
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * Clone @old_ns and set the clone refcount to 1
+ *
+ * Return: The new namespace or ERR_PTR.
+ */
+static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
+ struct time_namespace *old_ns)
+{
+ struct time_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ err = -ENOSPC;
+ ucounts = inc_time_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns)
+ goto fail_dec;
+
+ kref_init(&ns->kref);
+
+ ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!ns->vvar_page)
+ goto fail_free;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto fail_free_page;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &timens_operations;
+ ns->user_ns = get_user_ns(user_ns);
+ ns->offsets = old_ns->offsets;
+ ns->frozen_offsets = false;
+ return ns;
+
+fail_free_page:
+ __free_page(ns->vvar_page);
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_time_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
+}
+
+/**
+ * copy_time_ns - Create timens_for_children from @old_ns
+ * @flags: Cloning flags
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
+ * adds a refcounter to @old_ns otherwise.
+ *
+ * Return: timens_for_children namespace or ERR_PTR.
+ */
+struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWTIME))
+ return get_time_ns(old_ns);
+
+ return clone_time_ns(user_ns, old_ns);
+}
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_data->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces
+ * the time namespace handling path.
+ */
+static void timens_setup_vdso_data(struct vdso_data *vdata,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vdata->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vdata->seq = 1;
+ vdata->clock_mode = VCLOCK_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+static DEFINE_MUTEX(offset_lock);
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_data *vdata;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ mutex_lock(&offset_lock);
+ /* Nothing to-do: vvar_page has been already initialized. */
+ if (ns->frozen_offsets)
+ goto out;
+
+ ns->frozen_offsets = true;
+ vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_data(&vdata[i], ns);
+
+out:
+ mutex_unlock(&offset_lock);
+}
+
+void free_time_ns(struct kref *kref)
+{
+ struct time_namespace *ns;
+
+ ns = container_of(kref, struct time_namespace, kref);
+ dec_time_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ __free_page(ns->vvar_page);
+ kfree(ns);
+}
+
+static struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+
+static struct ns_common *timens_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static struct ns_common *timens_for_children_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void timens_put(struct ns_common *ns)
+{
+ put_time_ns(to_time_ns(ns));
+}
+
+static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct time_namespace *ns = to_time_ns(new);
+ int err;
+
+ if (!current_is_single_threaded())
+ return -EUSERS;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ timens_set_vvar_page(current, ns);
+
+ err = vdso_join_timens(current, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns_for_children);
+ nsproxy->time_ns_for_children = ns;
+ return 0;
+}
+
+int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+ struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
+ struct time_namespace *ns = to_time_ns(nsc);
+ int err;
+
+ /* create_new_namespaces() already incremented the ref counter */
+ if (nsproxy->time_ns == nsproxy->time_ns_for_children)
+ return 0;
+
+ timens_set_vvar_page(tsk, ns);
+
+ err = vdso_join_timens(tsk, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ return 0;
+}
+
+static struct user_namespace *timens_owner(struct ns_common *ns)
+{
+ return to_time_ns(ns)->user_ns;
+}
+
+static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
+{
+ seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec);
+}
+
+void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return;
+ time_ns = to_time_ns(ns);
+
+ show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
+ show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
+ put_time_ns(time_ns);
+}
+
+int proc_timens_set_offset(struct file *file, struct task_struct *p,
+ struct proc_timens_offset *offsets, int noffsets)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+ struct timespec64 tp;
+ int i, err;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return -ESRCH;
+ time_ns = to_time_ns(ns);
+
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
+ put_time_ns(time_ns);
+ return -EPERM;
+ }
+
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(&tp);
+ break;
+ case CLOCK_BOOTTIME:
+ ktime_get_boottime_ts64(&tp);
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ERANGE;
+
+ if (off->val.tv_sec > KTIME_SEC_MAX ||
+ off->val.tv_sec < -KTIME_SEC_MAX)
+ goto out;
+
+ tp = timespec64_add(tp, off->val);
+ /*
+ * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
+ * still unreachable.
+ */
+ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
+ goto out;
+ }
+
+ mutex_lock(&offset_lock);
+ if (time_ns->frozen_offsets) {
+ err = -EACCES;
+ goto out_unlock;
+ }
+
+ err = 0;
+ /* Don't report errors after this line */
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+ struct timespec64 *offset = NULL;
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ offset = &time_ns->offsets.monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ offset = &time_ns->offsets.boottime;
+ break;
+ }
+
+ *offset = off->val;
+ }
+
+out_unlock:
+ mutex_unlock(&offset_lock);
+out:
+ put_time_ns(time_ns);
+
+ return err;
+}
+
+const struct proc_ns_operations timens_operations = {
+ .name = "time",
+ .type = CLONE_NEWTIME,
+ .get = timens_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+const struct proc_ns_operations timens_for_children_operations = {
+ .name = "time_for_children",
+ .type = CLONE_NEWTIME,
+ .get = timens_for_children_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+struct time_namespace init_time_ns = {
+ .kref = KREF_INIT(3),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_TIME_INIT_INO,
+ .ns.ops = &timens_operations,
+ .frozen_offsets = true,
+};
+
+static int __init time_ns_init(void)
+{
+ return 0;
+}
+subsys_initcall(time_ns_init);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 65eb796610dc..069ca78fb0bf 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
/* fill PPS status fields */
pps_fill_timex(txc);
- txc->time.tv_sec = (time_t)ts->tv_sec;
+ txc->time.tv_sec = ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..77c0c2370b6d 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -14,8 +14,6 @@
#include "posix-timers.h"
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
@@ -315,8 +310,8 @@ out:
}
const struct k_clock clock_posix_dynamic = {
- .clock_getres = pc_clock_getres,
- .clock_set = pc_clock_settime,
- .clock_get = pc_clock_gettime,
- .clock_adj = pc_clock_adjtime,
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get_timespec = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0a426f4e3125..8ff6da77a01f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -20,11 +20,20 @@
static void posix_cpu_timer_rearm(struct k_itimer *timer);
+void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
+{
+ posix_cputimers_init(pct);
+ if (cpu_limit != RLIM_INFINITY) {
+ pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
+ pct->timers_active = true;
+ }
+}
+
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
- * tsk->signal->cputime_expires expiration cache if necessary. Needs
- * siglock protection since other code may update expiration cache as
- * well.
+ * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
+ * necessary. Needs siglock protection since other code may update the
+ * expiration cache as well.
*/
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
@@ -35,46 +44,97 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
spin_unlock_irq(&task->sighand->siglock);
}
-static int check_clock(const clockid_t which_clock)
+/*
+ * Functions for validating access to tasks.
+ */
+static struct task_struct *lookup_task(const pid_t pid, bool thread,
+ bool gettime)
{
- int error = 0;
struct task_struct *p;
- const pid_t pid = CPUCLOCK_PID(which_clock);
-
- if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
- return -EINVAL;
- if (pid == 0)
- return 0;
+ /*
+ * If the encoded PID is 0, then the timer is targeted at current
+ * or the process to which current belongs.
+ */
+ if (!pid)
+ return thread ? current : current->group_leader;
- rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
- same_thread_group(p, current) : has_group_leader_pid(p))) {
- error = -EINVAL;
+ if (!p)
+ return p;
+
+ if (thread)
+ return same_thread_group(p, current) ? p : NULL;
+
+ if (gettime) {
+ /*
+ * For clock_gettime(PROCESS) the task does not need to be
+ * the actual group leader. tsk->sighand gives
+ * access to the group's clock.
+ *
+ * Timers need the group leader because they take a
+ * reference on it and store the task pointer until the
+ * timer is destroyed.
+ */
+ return (p == current || thread_group_leader(p)) ? p : NULL;
}
+
+ /*
+ * For processes require that p is group leader.
+ */
+ return has_group_leader_pid(p) ? p : NULL;
+}
+
+static struct task_struct *__get_task_for_clock(const clockid_t clock,
+ bool getref, bool gettime)
+{
+ const bool thread = !!CPUCLOCK_PERTHREAD(clock);
+ const pid_t pid = CPUCLOCK_PID(clock);
+ struct task_struct *p;
+
+ if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
+ return NULL;
+
+ rcu_read_lock();
+ p = lookup_task(pid, thread, gettime);
+ if (p && getref)
+ get_task_struct(p);
rcu_read_unlock();
+ return p;
+}
- return error;
+static inline struct task_struct *get_task_for_clock(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, true, false);
+}
+
+static inline struct task_struct *get_task_for_clock_get(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, true, true);
+}
+
+static inline int validate_clock_permissions(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;
}
/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
-static void bump_cpu_timer(struct k_itimer *timer, u64 now)
+static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
{
+ u64 delta, incr, expires = timer->it.cpu.node.expires;
int i;
- u64 delta, incr;
if (!timer->it_interval)
- return;
+ return expires;
- if (now < timer->it.cpu.expires)
- return;
+ if (now < expires)
+ return expires;
incr = timer->it_interval;
- delta = now + incr - timer->it.cpu.expires;
+ delta = now + incr - expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
for (i = 0; incr < delta - incr; i++)
@@ -84,48 +144,26 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
if (delta < incr)
continue;
- timer->it.cpu.expires += incr;
+ timer->it.cpu.node.expires += incr;
timer->it_overrun += 1LL << i;
delta -= incr;
}
+ return timer->it.cpu.node.expires;
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
+/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
+static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
-static inline u64 prof_ticks(struct task_struct *p)
-{
- u64 utime, stime;
-
- task_cputime(p, &utime, &stime);
-
- return utime + stime;
-}
-static inline u64 virt_ticks(struct task_struct *p)
-{
- u64 utime, stime;
-
- task_cputime(p, &utime, &stime);
-
- return utime;
+ return !(~pct->bases[CPUCLOCK_PROF].nextevt |
+ ~pct->bases[CPUCLOCK_VIRT].nextevt |
+ ~pct->bases[CPUCLOCK_SCHED].nextevt);
}
static int
posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
- int error = check_clock(which_clock);
+ int error = validate_clock_permissions(which_clock);
+
if (!error) {
tp->tv_sec = 0;
tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
@@ -142,42 +180,66 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
}
static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
+posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
{
+ int error = validate_clock_permissions(clock);
+
/*
* You can never reset a CPU clock, but we check for other errors
* in the call before failing with EPERM.
*/
- int error = check_clock(which_clock);
- if (error == 0) {
- error = -EPERM;
- }
- return error;
+ return error ? : -EPERM;
}
-
/*
- * Sample a per-thread clock for the given task.
+ * Sample a per-thread clock for the given task. clkid is validated.
*/
-static int cpu_clock_sample(const clockid_t which_clock,
- struct task_struct *p, u64 *sample)
+static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
{
- switch (CPUCLOCK_WHICH(which_clock)) {
- default:
- return -EINVAL;
+ u64 utime, stime;
+
+ if (clkid == CPUCLOCK_SCHED)
+ return task_sched_runtime(p);
+
+ task_cputime(p, &utime, &stime);
+
+ switch (clkid) {
case CPUCLOCK_PROF:
- *sample = prof_ticks(p);
- break;
+ return utime + stime;
case CPUCLOCK_VIRT:
- *sample = virt_ticks(p);
- break;
- case CPUCLOCK_SCHED:
- *sample = task_sched_runtime(p);
- break;
+ return utime;
+ default:
+ WARN_ON_ONCE(1);
}
return 0;
}
+static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
+{
+ samples[CPUCLOCK_PROF] = stime + utime;
+ samples[CPUCLOCK_VIRT] = utime;
+ samples[CPUCLOCK_SCHED] = rtime;
+}
+
+static void task_sample_cputime(struct task_struct *p, u64 *samples)
+{
+ u64 stime, utime;
+
+ task_cputime(p, &utime, &stime);
+ store_samples(samples, stime, utime, p->se.sum_exec_runtime);
+}
+
+static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
+ u64 *samples)
+{
+ u64 stime, utime, rtime;
+
+ utime = atomic64_read(&at->utime);
+ stime = atomic64_read(&at->stime);
+ rtime = atomic64_read(&at->sum_exec_runtime);
+ store_samples(samples, stime, utime, rtime);
+}
+
/*
* Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
* to avoid race conditions with concurrent updates to cputime.
@@ -193,29 +255,56 @@ retry:
}
}
-static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
+ struct task_cputime *sum)
{
__update_gt_cputime(&cputime_atomic->utime, sum->utime);
__update_gt_cputime(&cputime_atomic->stime, sum->stime);
__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}
-/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
-static inline void sample_cputime_atomic(struct task_cputime *times,
- struct task_cputime_atomic *atomic_times)
+/**
+ * thread_group_sample_cputime - Sample cputime for a given task
+ * @tsk: Task for which cputime needs to be started
+ * @samples: Storage for time samples
+ *
+ * Called from sys_getitimer() to calculate the expiry time of an active
+ * timer. That means group cputime accounting is already active. Called
+ * with task sighand lock held.
+ *
+ * Updates @times with an uptodate sample of the thread group cputimes.
+ */
+void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
{
- times->utime = atomic64_read(&atomic_times->utime);
- times->stime = atomic64_read(&atomic_times->stime);
- times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
+
+ WARN_ON_ONCE(!pct->timers_active);
+
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+/**
+ * thread_group_start_cputime - Start cputime and return a sample
+ * @tsk: Task for which cputime needs to be started
+ * @samples: Storage for time samples
+ *
+ * The thread group cputime accouting is avoided when there are no posix
+ * CPU timers armed. Before starting a timer it's required to check whether
+ * the time accounting is active. If not, a full update of the atomic
+ * accounting store needs to be done and the accounting enabled.
+ *
+ * Updates @times with an uptodate sample of the thread group cputimes.
+ */
+static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- struct task_cputime sum;
+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
/* Check if cputimer isn't running. This is accessed without locking. */
- if (!READ_ONCE(cputimer->running)) {
+ if (!READ_ONCE(pct->timers_active)) {
+ struct task_cputime sum;
+
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
@@ -225,94 +314,69 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
update_gt_cputime(&cputimer->cputime_atomic, &sum);
/*
- * We're setting cputimer->running without a lock. Ensure
- * this only gets written to in one operation. We set
- * running after update_gt_cputime() as a small optimization,
- * but barriers are not required because update_gt_cputime()
+ * We're setting timers_active without a lock. Ensure this
+ * only gets written to in one operation. We set it after
+ * update_gt_cputime() as a small optimization, but
+ * barriers are not required because update_gt_cputime()
* can handle concurrent updates.
*/
- WRITE_ONCE(cputimer->running, true);
+ WRITE_ONCE(pct->timers_active, true);
}
- sample_cputime_atomic(times, &cputimer->cputime_atomic);
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with task sighand lock held for safe while_each_thread()
- * traversal.
- */
-static int cpu_clock_sample_group(const clockid_t which_clock,
- struct task_struct *p,
- u64 *sample)
+static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
{
- struct task_cputime cputime;
+ struct task_cputime ct;
- switch (CPUCLOCK_WHICH(which_clock)) {
- default:
- return -EINVAL;
- case CPUCLOCK_PROF:
- thread_group_cputime(p, &cputime);
- *sample = cputime.utime + cputime.stime;
- break;
- case CPUCLOCK_VIRT:
- thread_group_cputime(p, &cputime);
- *sample = cputime.utime;
- break;
- case CPUCLOCK_SCHED:
- thread_group_cputime(p, &cputime);
- *sample = cputime.sum_exec_runtime;
- break;
- }
- return 0;
+ thread_group_cputime(tsk, &ct);
+ store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
}
-static int posix_cpu_clock_get_task(struct task_struct *tsk,
- const clockid_t which_clock,
- struct timespec64 *tp)
+/*
+ * Sample a process (thread group) clock for the given task clkid. If the
+ * group's cputime accounting is already enabled, read the atomic
+ * store. Otherwise a full update is required. Task's sighand lock must be
+ * held to protect the task traversal on a full update. clkid is already
+ * validated.
+ */
+static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
+ bool start)
{
- int err = -EINVAL;
- u64 rtn;
+ struct thread_group_cputimer *cputimer = &p->signal->cputimer;
+ struct posix_cputimers *pct = &p->signal->posix_cputimers;
+ u64 samples[CPUCLOCK_MAX];
- if (CPUCLOCK_PERTHREAD(which_clock)) {
- if (same_thread_group(tsk, current))
- err = cpu_clock_sample(which_clock, tsk, &rtn);
+ if (!READ_ONCE(pct->timers_active)) {
+ if (start)
+ thread_group_start_cputime(p, samples);
+ else
+ __thread_group_cputime(p, samples);
} else {
- if (tsk == current || thread_group_leader(tsk))
- err = cpu_clock_sample_group(which_clock, tsk, &rtn);
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
- if (!err)
- *tp = ns_to_timespec64(rtn);
-
- return err;
+ return samples[clkid];
}
-
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
{
- const pid_t pid = CPUCLOCK_PID(which_clock);
- int err = -EINVAL;
+ const clockid_t clkid = CPUCLOCK_WHICH(clock);
+ struct task_struct *tsk;
+ u64 t;
- if (pid == 0) {
- /*
- * Special case constant value for our own clocks.
- * We don't have to do any lookup to find ourselves.
- */
- err = posix_cpu_clock_get_task(current, which_clock, tp);
- } else {
- /*
- * Find the given PID, and validate that the caller
- * should be able to see it.
- */
- struct task_struct *p;
- rcu_read_lock();
- p = find_task_by_vpid(pid);
- if (p)
- err = posix_cpu_clock_get_task(p, which_clock, tp);
- rcu_read_unlock();
- }
+ tsk = get_task_for_clock_get(clock);
+ if (!tsk)
+ return -EINVAL;
- return err;
+ if (CPUCLOCK_PERTHREAD(clock))
+ t = cpu_clock_sample(clkid, tsk);
+ else
+ t = cpu_clock_sample_group(clkid, tsk, false);
+ put_task_struct(tsk);
+
+ *tp = ns_to_timespec64(t);
+ return 0;
}
/*
@@ -322,44 +386,15 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *t
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
- int ret = 0;
- const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
- struct task_struct *p;
+ struct task_struct *p = get_task_for_clock(new_timer->it_clock);
- if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+ if (!p)
return -EINVAL;
new_timer->kclock = &clock_posix_cpu;
-
- INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-
- rcu_read_lock();
- if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
- if (pid == 0) {
- p = current;
- } else {
- p = find_task_by_vpid(pid);
- if (p && !same_thread_group(p, current))
- p = NULL;
- }
- } else {
- if (pid == 0) {
- p = current->group_leader;
- } else {
- p = find_task_by_vpid(pid);
- if (p && !has_group_leader_pid(p))
- p = NULL;
- }
- }
+ timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.task = p;
- if (p) {
- get_task_struct(p);
- } else {
- ret = -EINVAL;
- }
- rcu_read_unlock();
-
- return ret;
+ return 0;
}
/*
@@ -370,12 +405,14 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
*/
static int posix_cpu_timer_del(struct k_itimer *timer)
{
- int ret = 0;
- unsigned long flags;
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct task_struct *p = ctmr->task;
struct sighand_struct *sighand;
- struct task_struct *p = timer->it.cpu.task;
+ unsigned long flags;
+ int ret = 0;
- WARN_ON_ONCE(p == NULL);
+ if (WARN_ON_ONCE(!p))
+ return -EINVAL;
/*
* Protect against sighand release/switch in exit/exec and process/
@@ -384,15 +421,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
sighand = lock_task_sighand(p, &flags);
if (unlikely(sighand == NULL)) {
/*
- * We raced with the reaping of the task.
- * The deletion should have cleared us off the list.
+ * This raced with the reaping of the task. The exit cleanup
+ * should have removed this timer from the timer queue.
*/
- WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
+ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
} else {
if (timer->it.cpu.firing)
ret = TIMER_RETRY;
else
- list_del(&timer->it.cpu.entry);
+ cpu_timer_dequeue(ctmr);
unlock_task_sighand(p, &flags);
}
@@ -403,25 +440,30 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
return ret;
}
-static void cleanup_timers_list(struct list_head *head)
+static void cleanup_timerqueue(struct timerqueue_head *head)
{
- struct cpu_timer_list *timer, *next;
+ struct timerqueue_node *node;
+ struct cpu_timer *ctmr;
- list_for_each_entry_safe(timer, next, head, entry)
- list_del_init(&timer->entry);
+ while ((node = timerqueue_getnext(head))) {
+ timerqueue_del(head, node);
+ ctmr = container_of(node, struct cpu_timer, node);
+ ctmr->head = NULL;
+ }
}
/*
- * Clean out CPU timers still ticking when a thread exited. The task
- * pointer is cleared, and the expiry time is replaced with the residual
- * time for later timer_gettime calls to return.
+ * Clean out CPU timers which are still armed when a thread exits. The
+ * timers are only removed from the list. No other updates are done. The
+ * corresponding posix timers are still accessible, but cannot be rearmed.
+ *
* This must be called with the siglock held.
*/
-static void cleanup_timers(struct list_head *head)
+static void cleanup_timers(struct posix_cputimers *pct)
{
- cleanup_timers_list(head);
- cleanup_timers_list(++head);
- cleanup_timers_list(++head);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
}
/*
@@ -431,16 +473,11 @@ static void cleanup_timers(struct list_head *head)
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- cleanup_timers(tsk->cpu_timers);
+ cleanup_timers(&tsk->posix_cputimers);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- cleanup_timers(tsk->signal->cpu_timers);
-}
-
-static inline int expires_gt(u64 expires, u64 new_exp)
-{
- return expires == 0 || expires > new_exp;
+ cleanup_timers(&tsk->signal->posix_cputimers);
}
/*
@@ -449,58 +486,33 @@ static inline int expires_gt(u64 expires, u64 new_exp)
*/
static void arm_timer(struct k_itimer *timer)
{
- struct task_struct *p = timer->it.cpu.task;
- struct list_head *head, *listpos;
- struct task_cputime *cputime_expires;
- struct cpu_timer_list *const nt = &timer->it.cpu;
- struct cpu_timer_list *next;
-
- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
- head = p->cpu_timers;
- cputime_expires = &p->cputime_expires;
- } else {
- head = p->signal->cpu_timers;
- cputime_expires = &p->signal->cputime_expires;
- }
- head += CPUCLOCK_WHICH(timer->it_clock);
-
- listpos = head;
- list_for_each_entry(next, head, entry) {
- if (nt->expires < next->expires)
- break;
- listpos = &next->entry;
- }
- list_add(&nt->entry, listpos);
-
- if (listpos == head) {
- u64 exp = nt->expires;
+ int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ u64 newexp = cpu_timer_getexpires(ctmr);
+ struct task_struct *p = ctmr->task;
+ struct posix_cputimer_base *base;
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ base = p->posix_cputimers.bases + clkidx;
+ else
+ base = p->signal->posix_cputimers.bases + clkidx;
+
+ if (!cpu_timer_enqueue(&base->tqhead, ctmr))
+ return;
- /*
- * We are the new earliest-expiring POSIX 1.b timer, hence
- * need to update expiration cache. Take into account that
- * for process timers we share expiration cache with itimers
- * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
- */
+ /*
+ * We are the new earliest-expiring POSIX 1.b timer, hence
+ * need to update expiration cache. Take into account that
+ * for process timers we share expiration cache with itimers
+ * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
+ */
+ if (newexp < base->nextevt)
+ base->nextevt = newexp;
- switch (CPUCLOCK_WHICH(timer->it_clock)) {
- case CPUCLOCK_PROF:
- if (expires_gt(cputime_expires->prof_exp, exp))
- cputime_expires->prof_exp = exp;
- break;
- case CPUCLOCK_VIRT:
- if (expires_gt(cputime_expires->virt_exp, exp))
- cputime_expires->virt_exp = exp;
- break;
- case CPUCLOCK_SCHED:
- if (expires_gt(cputime_expires->sched_exp, exp))
- cputime_expires->sched_exp = exp;
- break;
- }
- if (CPUCLOCK_PERTHREAD(timer->it_clock))
- tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
- else
- tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
- }
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
+ else
+ tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
}
/*
@@ -508,24 +520,26 @@ static void arm_timer(struct k_itimer *timer)
*/
static void cpu_timer_fire(struct k_itimer *timer)
{
+ struct cpu_timer *ctmr = &timer->it.cpu;
+
if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
/*
* User don't want any signal.
*/
- timer->it.cpu.expires = 0;
+ cpu_timer_setexpires(ctmr, 0);
} else if (unlikely(timer->sigq == NULL)) {
/*
* This a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
*/
wake_up_process(timer->it_process);
- timer->it.cpu.expires = 0;
+ cpu_timer_setexpires(ctmr, 0);
} else if (!timer->it_interval) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
posix_timer_event(timer, 0);
- timer->it.cpu.expires = 0;
+ cpu_timer_setexpires(ctmr, 0);
} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
/*
* The signal did not get queued because the signal
@@ -539,33 +553,6 @@ static void cpu_timer_fire(struct k_itimer *timer)
}
/*
- * Sample a process (thread group) timer for the given group_leader task.
- * Must be called with task sighand lock held for safe while_each_thread()
- * traversal.
- */
-static int cpu_timer_sample_group(const clockid_t which_clock,
- struct task_struct *p, u64 *sample)
-{
- struct task_cputime cputime;
-
- thread_group_cputimer(p, &cputime);
- switch (CPUCLOCK_WHICH(which_clock)) {
- default:
- return -EINVAL;
- case CPUCLOCK_PROF:
- *sample = cputime.utime + cputime.stime;
- break;
- case CPUCLOCK_VIRT:
- *sample = cputime.utime;
- break;
- case CPUCLOCK_SCHED:
- *sample = cputime.sum_exec_runtime;
- break;
- }
- return 0;
-}
-
-/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
* If we return TIMER_RETRY, it's necessary to release the timer's lock
@@ -574,13 +561,16 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
struct itimerspec64 *new, struct itimerspec64 *old)
{
- unsigned long flags;
- struct sighand_struct *sighand;
- struct task_struct *p = timer->it.cpu.task;
+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
u64 old_expires, new_expires, old_incr, val;
- int ret;
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct task_struct *p = ctmr->task;
+ struct sighand_struct *sighand;
+ unsigned long flags;
+ int ret = 0;
- WARN_ON_ONCE(p == NULL);
+ if (WARN_ON_ONCE(!p))
+ return -EINVAL;
/*
* Use the to_ktime conversion because that clamps the maximum
@@ -597,22 +587,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* If p has just been reaped, we can no
* longer get any information about it at all.
*/
- if (unlikely(sighand == NULL)) {
+ if (unlikely(sighand == NULL))
return -ESRCH;
- }
/*
* Disarm any old timer after extracting its expiry time.
*/
-
- ret = 0;
old_incr = timer->it_interval;
- old_expires = timer->it.cpu.expires;
+ old_expires = cpu_timer_getexpires(ctmr);
+
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
ret = TIMER_RETRY;
- } else
- list_del_init(&timer->it.cpu.entry);
+ } else {
+ cpu_timer_dequeue(ctmr);
+ }
/*
* We need to sample the current value to convert the new
@@ -622,11 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* times (in arm_timer). With an absolute time, we must
* check if it's already passed. In short, we need a sample.
*/
- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
- cpu_clock_sample(timer->it_clock, p, &val);
- } else {
- cpu_timer_sample_group(timer->it_clock, p, &val);
- }
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ val = cpu_clock_sample(clkid, p);
+ else
+ val = cpu_clock_sample_group(clkid, p, true);
if (old) {
if (old_expires == 0) {
@@ -634,18 +622,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
old->it_value.tv_nsec = 0;
} else {
/*
- * Update the timer in case it has
- * overrun already. If it has,
- * we'll report it as having overrun
- * and with the next reloaded timer
- * already ticking, though we are
- * swallowing that pending
- * notification here to install the
- * new setting.
+ * Update the timer in case it has overrun already.
+ * If it has, we'll report it as having overrun and
+ * with the next reloaded timer already ticking,
+ * though we are swallowing that pending
+ * notification here to install the new setting.
*/
- bump_cpu_timer(timer, val);
- if (val < timer->it.cpu.expires) {
- old_expires = timer->it.cpu.expires - val;
+ u64 exp = bump_cpu_timer(timer, val);
+
+ if (val < exp) {
+ old_expires = exp - val;
old->it_value = ns_to_timespec64(old_expires);
} else {
old->it_value.tv_nsec = 1;
@@ -674,7 +660,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* For a timer with no notification action, we don't actually
* arm the timer (we'll just fake it for timer_gettime).
*/
- timer->it.cpu.expires = new_expires;
+ cpu_timer_setexpires(ctmr, new_expires);
if (new_expires != 0 && val < new_expires) {
arm_timer(timer);
}
@@ -715,24 +701,27 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
- u64 now;
- struct task_struct *p = timer->it.cpu.task;
+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ u64 now, expires = cpu_timer_getexpires(ctmr);
+ struct task_struct *p = ctmr->task;
- WARN_ON_ONCE(p == NULL);
+ if (WARN_ON_ONCE(!p))
+ return;
/*
* Easy part: convert the reload time.
*/
itp->it_interval = ktime_to_timespec64(timer->it_interval);
- if (!timer->it.cpu.expires)
+ if (!expires)
return;
/*
* Sample the clock to take the difference with the expiry time.
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
- cpu_clock_sample(timer->it_clock, p, &now);
+ now = cpu_clock_sample(clkid, p);
} else {
struct sighand_struct *sighand;
unsigned long flags;
@@ -747,18 +736,18 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
/*
* The process has been reaped.
* We can't even collect a sample any more.
- * Call the timer disarmed, nothing else to do.
+ * Disarm the timer, nothing else to do.
*/
- timer->it.cpu.expires = 0;
+ cpu_timer_setexpires(ctmr, 0);
return;
} else {
- cpu_timer_sample_group(timer->it_clock, p, &now);
+ now = cpu_clock_sample_group(clkid, p, false);
unlock_task_sighand(p, &flags);
}
}
- if (now < timer->it.cpu.expires) {
- itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
+ if (now < expires) {
+ itp->it_value = ns_to_timespec64(expires - now);
} else {
/*
* The timer should have expired already, but the firing
@@ -769,26 +758,42 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
}
}
-static unsigned long long
-check_timers_list(struct list_head *timers,
- struct list_head *firing,
- unsigned long long curr)
-{
- int maxfire = 20;
+#define MAX_COLLECTED 20
- while (!list_empty(timers)) {
- struct cpu_timer_list *t;
+static u64 collect_timerqueue(struct timerqueue_head *head,
+ struct list_head *firing, u64 now)
+{
+ struct timerqueue_node *next;
+ int i = 0;
+
+ while ((next = timerqueue_getnext(head))) {
+ struct cpu_timer *ctmr;
+ u64 expires;
+
+ ctmr = container_of(next, struct cpu_timer, node);
+ expires = cpu_timer_getexpires(ctmr);
+ /* Limit the number of timers to expire at once */
+ if (++i == MAX_COLLECTED || now < expires)
+ return expires;
+
+ ctmr->firing = 1;
+ cpu_timer_dequeue(ctmr);
+ list_add_tail(&ctmr->elist, firing);
+ }
- t = list_first_entry(timers, struct cpu_timer_list, entry);
+ return U64_MAX;
+}
- if (!--maxfire || curr < t->expires)
- return t->expires;
+static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
+ struct list_head *firing)
+{
+ struct posix_cputimer_base *base = pct->bases;
+ int i;
- t->firing = 1;
- list_move_tail(&t->entry, firing);
+ for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
+ base->nextevt = collect_timerqueue(&base->tqhead, firing,
+ samples[i]);
}
-
- return 0;
}
static inline void check_dl_overrun(struct task_struct *tsk)
@@ -799,6 +804,20 @@ static inline void check_dl_overrun(struct task_struct *tsk)
}
}
+static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
+{
+ if (time < limit)
+ return false;
+
+ if (print_fatal_signals) {
+ pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
+ rt ? "RT" : "CPU", hard ? "hard" : "soft",
+ current->comm, task_pid_nr(current));
+ }
+ __group_send_sig_info(signo, SEND_SIG_PRIV, current);
+ return true;
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -807,76 +826,50 @@ static inline void check_dl_overrun(struct task_struct *tsk)
static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
- struct list_head *timers = tsk->cpu_timers;
- struct task_cputime *tsk_expires = &tsk->cputime_expires;
- u64 expires;
+ struct posix_cputimers *pct = &tsk->posix_cputimers;
+ u64 samples[CPUCLOCK_MAX];
unsigned long soft;
if (dl_task(tsk))
check_dl_overrun(tsk);
- /*
- * If cputime_expires is zero, then there are no active
- * per thread CPU timers.
- */
- if (task_cputime_zero(&tsk->cputime_expires))
+ if (expiry_cache_is_inactive(pct))
return;
- expires = check_timers_list(timers, firing, prof_ticks(tsk));
- tsk_expires->prof_exp = expires;
-
- expires = check_timers_list(++timers, firing, virt_ticks(tsk));
- tsk_expires->virt_exp = expires;
-
- tsk_expires->sched_exp = check_timers_list(++timers, firing,
- tsk->se.sum_exec_runtime);
+ task_sample_cputime(tsk, samples);
+ collect_posix_cputimers(pct, samples, firing);
/*
* Check for the special case thread timers.
*/
soft = task_rlimit(tsk, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
+ /* Task RT timeout is accounted in jiffies. RTTIME is usec */
+ unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
+ /* At the hard limit, send SIGKILL. No further action. */
if (hard != RLIM_INFINITY &&
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
- /*
- * At the hard limit, we just die.
- * No need to calculate anything else now.
- */
- if (print_fatal_signals) {
- pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
- }
- __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ check_rlimit(rttime, hard, SIGKILL, true, true))
return;
- }
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
- /*
- * At the soft limit, send a SIGXCPU every second.
- */
- if (soft < hard) {
- soft += USEC_PER_SEC;
- tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
- soft;
- }
- if (print_fatal_signals) {
- pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
- }
- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+
+ /* At the soft limit, send a SIGXCPU every second */
+ if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
+ soft += USEC_PER_SEC;
+ tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
}
}
- if (task_cputime_zero(tsk_expires))
+
+ if (expiry_cache_is_inactive(pct))
tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
static inline void stop_process_timers(struct signal_struct *sig)
{
- struct thread_group_cputimer *cputimer = &sig->cputimer;
+ struct posix_cputimers *pct = &sig->posix_cputimers;
- /* Turn off cputimer->running. This is done without locking. */
- WRITE_ONCE(cputimer->running, false);
+ /* Turn off the active flag. This is done without locking. */
+ WRITE_ONCE(pct->timers_active, false);
tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}
@@ -898,7 +891,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
}
- if (it->expires && (!*expires || it->expires < *expires))
+ if (it->expires && it->expires < *expires)
*expires = it->expires;
}
@@ -911,87 +904,69 @@ static void check_process_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct signal_struct *const sig = tsk->signal;
- u64 utime, ptime, virt_expires, prof_expires;
- u64 sum_sched_runtime, sched_expires;
- struct list_head *timers = sig->cpu_timers;
- struct task_cputime cputime;
+ struct posix_cputimers *pct = &sig->posix_cputimers;
+ u64 samples[CPUCLOCK_MAX];
unsigned long soft;
/*
- * If cputimer is not running, then there are no active
- * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
+ * If there are no active process wide timers (POSIX 1.b, itimers,
+ * RLIMIT_CPU) nothing to check. Also skip the process wide timer
+ * processing when there is already another task handling them.
*/
- if (!READ_ONCE(tsk->signal->cputimer.running))
+ if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
return;
- /*
+ /*
* Signify that a thread is checking for process timers.
* Write access to this field is protected by the sighand lock.
*/
- sig->cputimer.checking_timer = true;
+ pct->expiry_active = true;
/*
- * Collect the current process totals.
+ * Collect the current process totals. Group accounting is active
+ * so the sample can be taken directly.
*/
- thread_group_cputimer(tsk, &cputime);
- utime = cputime.utime;
- ptime = utime + cputime.stime;
- sum_sched_runtime = cputime.sum_exec_runtime;
-
- prof_expires = check_timers_list(timers, firing, ptime);
- virt_expires = check_timers_list(++timers, firing, utime);
- sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
+ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
+ collect_posix_cputimers(pct, samples, firing);
/*
* Check for the special case process timers.
*/
- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
- SIGPROF);
- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
- SIGVTALRM);
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
+ &pct->bases[CPUCLOCK_PROF].nextevt,
+ samples[CPUCLOCK_PROF], SIGPROF);
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
+ &pct->bases[CPUCLOCK_VIRT].nextevt,
+ samples[CPUCLOCK_VIRT], SIGVTALRM);
+
soft = task_rlimit(tsk, RLIMIT_CPU);
if (soft != RLIM_INFINITY) {
- unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
+ /* RLIMIT_CPU is in seconds. Samples are nanoseconds */
unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
- u64 x;
- if (psecs >= hard) {
- /*
- * At the hard limit, we just die.
- * No need to calculate anything else now.
- */
- if (print_fatal_signals) {
- pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
- }
- __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ u64 ptime = samples[CPUCLOCK_PROF];
+ u64 softns = (u64)soft * NSEC_PER_SEC;
+ u64 hardns = (u64)hard * NSEC_PER_SEC;
+
+ /* At the hard limit, send SIGKILL. No further action. */
+ if (hard != RLIM_INFINITY &&
+ check_rlimit(ptime, hardns, SIGKILL, false, true))
return;
+
+ /* At the soft limit, send a SIGXCPU every second */
+ if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
+ sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
+ softns += NSEC_PER_SEC;
}
- if (psecs >= soft) {
- /*
- * At the soft limit, send a SIGXCPU every second.
- */
- if (print_fatal_signals) {
- pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
- }
- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
- if (soft < hard) {
- soft++;
- sig->rlim[RLIMIT_CPU].rlim_cur = soft;
- }
- }
- x = soft * NSEC_PER_SEC;
- if (!prof_expires || x < prof_expires)
- prof_expires = x;
+
+ /* Update the expiry cache */
+ if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
+ pct->bases[CPUCLOCK_PROF].nextevt = softns;
}
- sig->cputime_expires.prof_exp = prof_expires;
- sig->cputime_expires.virt_exp = virt_expires;
- sig->cputime_expires.sched_exp = sched_expires;
- if (task_cputime_zero(&sig->cputime_expires))
+ if (expiry_cache_is_inactive(pct))
stop_process_timers(sig);
- sig->cputimer.checking_timer = false;
+ pct->expiry_active = false;
}
/*
@@ -1000,18 +975,21 @@ static void check_process_timers(struct task_struct *tsk,
*/
static void posix_cpu_timer_rearm(struct k_itimer *timer)
{
+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct task_struct *p = ctmr->task;
struct sighand_struct *sighand;
unsigned long flags;
- struct task_struct *p = timer->it.cpu.task;
u64 now;
- WARN_ON_ONCE(p == NULL);
+ if (WARN_ON_ONCE(!p))
+ return;
/*
* Fetch the current sample and update the timer's expiry time.
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
- cpu_clock_sample(timer->it_clock, p, &now);
+ now = cpu_clock_sample(clkid, p);
bump_cpu_timer(timer, now);
if (unlikely(p->exit_state))
return;
@@ -1031,13 +1009,13 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
* The process has been reaped.
* We can't even collect a sample any more.
*/
- timer->it.cpu.expires = 0;
+ cpu_timer_setexpires(ctmr, 0);
return;
} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
/* If the process is dying, no need to rearm */
goto unlock;
}
- cpu_timer_sample_group(timer->it_clock, p, &now);
+ now = cpu_clock_sample_group(clkid, p, true);
bump_cpu_timer(timer, now);
/* Leave the sighand locked for the call below. */
}
@@ -1051,26 +1029,24 @@ unlock:
}
/**
- * task_cputime_expired - Compare two task_cputime entities.
+ * task_cputimers_expired - Check whether posix CPU timers are expired
*
- * @sample: The task_cputime structure to be checked for expiration.
- * @expires: Expiration times, against which @sample will be checked.
+ * @samples: Array of current samples for the CPUCLOCK clocks
+ * @pct: Pointer to a posix_cputimers container
*
- * Checks @sample against @expires to see if any field of @sample has expired.
- * Returns true if any field of the former is greater than the corresponding
- * field of the latter if the latter field is set. Otherwise returns false.
+ * Returns true if any member of @samples is greater than the corresponding
+ * member of @pct->bases[CLK].nextevt. False otherwise
*/
-static inline int task_cputime_expired(const struct task_cputime *sample,
- const struct task_cputime *expires)
+static inline bool
+task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
{
- if (expires->utime && sample->utime >= expires->utime)
- return 1;
- if (expires->stime && sample->utime + sample->stime >= expires->stime)
- return 1;
- if (expires->sum_exec_runtime != 0 &&
- sample->sum_exec_runtime >= expires->sum_exec_runtime)
- return 1;
- return 0;
+ int i;
+
+ for (i = 0; i < CPUCLOCK_MAX; i++) {
+ if (samples[i] >= pct->bases[i].nextevt)
+ return true;
+ }
+ return false;
}
/**
@@ -1083,48 +1059,50 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
* timers and compare them with the corresponding expiration times. Return
* true if a timer has expired, else return false.
*/
-static inline int fastpath_timer_check(struct task_struct *tsk)
+static inline bool fastpath_timer_check(struct task_struct *tsk)
{
+ struct posix_cputimers *pct = &tsk->posix_cputimers;
struct signal_struct *sig;
- if (!task_cputime_zero(&tsk->cputime_expires)) {
- struct task_cputime task_sample;
+ if (!expiry_cache_is_inactive(pct)) {
+ u64 samples[CPUCLOCK_MAX];
- task_cputime(tsk, &task_sample.utime, &task_sample.stime);
- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
- if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
- return 1;
+ task_sample_cputime(tsk, samples);
+ if (task_cputimers_expired(samples, pct))
+ return true;
}
sig = tsk->signal;
+ pct = &sig->posix_cputimers;
/*
- * Check if thread group timers expired when the cputimer is
- * running and no other thread in the group is already checking
- * for thread group cputimers. These fields are read without the
- * sighand lock. However, this is fine because this is meant to
- * be a fastpath heuristic to determine whether we should try to
- * acquire the sighand lock to check/handle timers.
+ * Check if thread group timers expired when timers are active and
+ * no other thread in the group is already handling expiry for
+ * thread group cputimers. These fields are read without the
+ * sighand lock. However, this is fine because this is meant to be
+ * a fastpath heuristic to determine whether we should try to
+ * acquire the sighand lock to handle timer expiry.
*
- * In the worst case scenario, if 'running' or 'checking_timer' gets
- * set but the current thread doesn't see the change yet, we'll wait
- * until the next thread in the group gets a scheduler interrupt to
- * handle the timer. This isn't an issue in practice because these
- * types of delays with signals actually getting sent are expected.
+ * In the worst case scenario, if concurrently timers_active is set
+ * or expiry_active is cleared, but the current thread doesn't see
+ * the change yet, the timer checks are delayed until the next
+ * thread in the group gets a scheduler interrupt to handle the
+ * timer. This isn't an issue in practice because these types of
+ * delays with signals actually getting sent are expected.
*/
- if (READ_ONCE(sig->cputimer.running) &&
- !READ_ONCE(sig->cputimer.checking_timer)) {
- struct task_cputime group_sample;
+ if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
+ u64 samples[CPUCLOCK_MAX];
- sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
+ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
+ samples);
- if (task_cputime_expired(&group_sample, &sig->cputime_expires))
- return 1;
+ if (task_cputimers_expired(samples, pct))
+ return true;
}
if (dl_task(tsk) && tsk->dl.dl_overrun)
- return 1;
+ return true;
- return 0;
+ return false;
}
/*
@@ -1132,11 +1110,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
* already updated our counts. We need to check if any timers fire now.
* Interrupts are disabled.
*/
-void run_posix_cpu_timers(struct task_struct *tsk)
+void run_posix_cpu_timers(void)
{
- LIST_HEAD(firing);
+ struct task_struct *tsk = current;
struct k_itimer *timer, *next;
unsigned long flags;
+ LIST_HEAD(firing);
lockdep_assert_irqs_disabled();
@@ -1174,11 +1153,11 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* each timer's lock before clearing its firing flag, so no
* timer call will interfere.
*/
- list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+ list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
int cpu_firing;
spin_lock(&timer->it_lock);
- list_del_init(&timer->it.cpu.entry);
+ list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
timer->it.cpu.firing = 0;
/*
@@ -1196,16 +1175,18 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
* The tsk->sighand->siglock must be held by the caller.
*/
-void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
u64 *newval, u64 *oldval)
{
- u64 now;
- int ret;
+ u64 now, *nextevt;
+
+ if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
+ return;
- WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
- ret = cpu_timer_sample_group(clock_idx, tsk, &now);
+ nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
+ now = cpu_clock_sample_group(clkid, tsk, true);
- if (oldval && ret != -EINVAL) {
+ if (oldval) {
/*
* We are setting itimer. The *oldval is absolute and we update
* it to be relative, *newval argument is relative and we update
@@ -1226,19 +1207,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
/*
- * Update expiration cache if we are the earliest timer, or eventually
- * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
+ * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF
+ * expiry cache is also used by RLIMIT_CPU!.
*/
- switch (clock_idx) {
- case CPUCLOCK_PROF:
- if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
- tsk->signal->cputime_expires.prof_exp = *newval;
- break;
- case CPUCLOCK_VIRT:
- if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
- tsk->signal->cputime_expires.virt_exp = *newval;
- break;
- }
+ if (*newval < *nextevt)
+ *nextevt = *newval;
tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
}
@@ -1260,6 +1233,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
timer.it_overrun = -1;
error = posix_cpu_timer_create(&timer);
timer.it_process = current;
+
if (!error) {
static struct itimerspec64 zero_it;
struct restart_block *restart;
@@ -1275,7 +1249,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
}
while (!signal_pending(current)) {
- if (timer.it.cpu.expires == 0) {
+ if (!cpu_timer_getexpires(&timer.it.cpu)) {
/*
* Our timer fired and was reset, below
* deletion can not fail.
@@ -1297,7 +1271,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
/*
* We were interrupted by a signal.
*/
- expires = timer.it.cpu.expires;
+ expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
/*
@@ -1417,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
}
const struct k_clock clock_posix_cpu = {
- .clock_getres = posix_cpu_clock_getres,
- .clock_set = posix_cpu_clock_set,
- .clock_get = posix_cpu_clock_get,
- .timer_create = posix_cpu_timer_create,
- .nsleep = posix_cpu_nsleep,
- .timer_set = posix_cpu_timer_set,
- .timer_del = posix_cpu_timer_del,
- .timer_get = posix_cpu_timer_get,
- .timer_rearm = posix_cpu_timer_rearm,
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get_timespec = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+ .timer_rearm = posix_cpu_timer_rearm,
};
const struct k_clock clock_process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get_timespec = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
};
const struct k_clock clock_thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .timer_create = thread_cpu_timer_create,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get_timespec = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 67df65f887ac..fcb3b21d8bdc 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include <linux/timekeeping.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
#include <linux/compat.h>
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
break;
case CLOCK_MONOTONIC:
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
break;
case CLOCK_BOOTTIME:
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
break;
default:
return -EINVAL;
@@ -126,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -144,13 +148,19 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
+#endif
+
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
@@ -212,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -230,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index d7f2d91acdac..ff0eb30de346 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
+#include <linux/time_namespace.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -165,12 +166,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
}
+static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
+{
+ return ktime_get_real();
+}
+
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
@@ -187,18 +193,25 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
+static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
+{
+ return ktime_get();
+}
+
/*
* Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -213,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec64 *tp)
{
ktime_get_coarse_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -222,18 +236,29 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
+{
+ return ktime_get_boottime();
+}
+
+static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
}
+static ktime_t posix_get_tai_ktime(clockid_t which_clock)
+{
+ return ktime_get_clocktai();
+}
+
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
@@ -442,7 +467,7 @@ static struct k_itimer * alloc_posix_timer(void)
static void k_itimer_rcu_free(struct rcu_head *head)
{
- struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+ struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
kmem_cache_free(posix_timers_cache, tmr);
}
@@ -459,7 +484,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
+ call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
static int common_timer_create(struct k_itimer *new_timer)
@@ -645,7 +670,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
const struct k_clock *kc = timr->kclock;
ktime_t now, remaining, iv;
- struct timespec64 ts64;
bool sig_none;
sig_none = timr->it_sigev_notify == SIGEV_NONE;
@@ -663,12 +687,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
return;
}
- /*
- * The timespec64 based conversion is suboptimal, but it's not
- * worth to implement yet another callback.
- */
- kc->clock_get(timr->it_clock, &ts64);
- now = timespec64_to_ktime(ts64);
+ now = kc->clock_get_ktime(timr->it_clock);
/*
* When a requeue is pending or this is a SIGEV_NONE timer move the
@@ -781,7 +800,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
- * functions which use timr->kclock->clock_get() work.
+ * functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
@@ -805,6 +824,35 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
return hrtimer_try_to_cancel(&timr->it.real.timer);
}
+static void common_timer_wait_running(struct k_itimer *timer)
+{
+ hrtimer_cancel_wait_running(&timer->it.real.timer);
+}
+
+/*
+ * On PREEMPT_RT this prevent priority inversion against softirq kthread in
+ * case it gets preempted while executing a timer callback. See comments in
+ * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
+ * cpu_relax().
+ */
+static struct k_itimer *timer_wait_running(struct k_itimer *timer,
+ unsigned long *flags)
+{
+ const struct k_clock *kc = READ_ONCE(timer->kclock);
+ timer_t timer_id = READ_ONCE(timer->it_id);
+
+ /* Prevent kfree(timer) after dropping the lock */
+ rcu_read_lock();
+ unlock_timer(timer, *flags);
+
+ if (!WARN_ON_ONCE(!kc->timer_wait_running))
+ kc->timer_wait_running(timer);
+
+ rcu_read_unlock();
+ /* Relock the timer. It might be not longer hashed. */
+ return lock_timer(timer_id, flags);
+}
+
/* Set a POSIX.1b interval timer. */
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
@@ -837,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
+ if (flags & TIMER_ABSTIME)
+ expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
@@ -844,13 +894,13 @@ int common_timer_set(struct k_itimer *timr, int flags,
return 0;
}
-static int do_timer_settime(timer_t timer_id, int flags,
+static int do_timer_settime(timer_t timer_id, int tmr_flags,
struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
const struct k_clock *kc;
struct k_itimer *timr;
- unsigned long flag;
+ unsigned long flags;
int error = 0;
if (!timespec64_valid(&new_spec64->it_interval) ||
@@ -859,8 +909,9 @@ static int do_timer_settime(timer_t timer_id, int flags,
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
+
+ timr = lock_timer(timer_id, &flags);
retry:
- timr = lock_timer(timer_id, &flag);
if (!timr)
return -EINVAL;
@@ -868,13 +919,16 @@ retry:
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
else
- error = kc->timer_set(timr, flags, new_spec64, old_spec64);
+ error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
- unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
- old_spec64 = NULL; // We already got the old time...
+ // We already got the old time...
+ old_spec64 = NULL;
+ /* Unlocks and relocks the timer if it still exists */
+ timr = timer_wait_running(timr, &flags);
goto retry;
}
+ unlock_timer(timr, flags);
return error;
}
@@ -951,13 +1005,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
struct k_itimer *timer;
unsigned long flags;
-retry_delete:
timer = lock_timer(timer_id, &flags);
+
+retry_delete:
if (!timer)
return -EINVAL;
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- unlock_timer(timer, flags);
+ if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
+ /* Unlocks and relocks the timer if it still exists */
+ timer = timer_wait_running(timer, &flags);
goto retry_delete;
}
@@ -1032,7 +1088,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get_timespec(which_clock, &kernel_tp);
if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
@@ -1114,7 +1170,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
if (!kc)
return -EINVAL;
- err = kc->clock_get(which_clock, &ts);
+ err = kc->clock_get_timespec(which_clock, &ts);
if (!err && put_old_timespec32(&ts, tp))
err = -EFAULT;
@@ -1165,7 +1221,22 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
- return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
+static int common_nsleep_timens(const clockid_t which_clock, int flags,
+ const struct timespec64 *rqtp)
+{
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -1226,7 +1297,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_clock_realtime_get,
+ .clock_get_timespec = posix_get_realtime_timespec,
+ .clock_get_ktime = posix_get_realtime_ktime,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1238,13 +1310,15 @@ static const struct k_clock clock_realtime = {
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
+ .timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_ktime_get_ts,
- .nsleep = common_nsleep,
+ .clock_get_timespec = posix_get_monotonic_timespec,
+ .clock_get_ktime = posix_get_monotonic_ktime,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
@@ -1253,27 +1327,29 @@ static const struct k_clock clock_monotonic = {
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
+ .timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_monotonic_raw = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_monotonic_raw,
+ .clock_get_timespec = posix_get_monotonic_raw,
};
static const struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
+ .clock_get_timespec = posix_get_realtime_coarse,
};
static const struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
+ .clock_get_timespec = posix_get_monotonic_coarse,
};
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_tai,
+ .clock_get_ktime = posix_get_tai_ktime,
+ .clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1283,13 +1359,15 @@ static const struct k_clock clock_tai = {
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
+ .timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
- .nsleep = common_nsleep,
+ .clock_get_ktime = posix_get_boottime_ktime,
+ .clock_get_timespec = posix_get_boottime_timespec,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
@@ -1298,6 +1376,7 @@ static const struct k_clock clock_boottime = {
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
+ .timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index de5daa6d975a..f32a2ebba9b8 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -6,8 +6,11 @@ struct k_clock {
struct timespec64 *tp);
int (*clock_set)(const clockid_t which_clock,
const struct timespec64 *tp);
- int (*clock_get)(const clockid_t which_clock,
- struct timespec64 *tp);
+ /* Returns the clock value in the current time namespace. */
+ int (*clock_get_timespec)(const clockid_t which_clock,
+ struct timespec64 *tp);
+ /* Returns the clock value in the root time namespace. */
+ ktime_t (*clock_get_ktime)(const clockid_t which_clock);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
@@ -24,6 +27,7 @@ struct k_clock {
int (*timer_try_to_cancel)(struct k_itimer *timr);
void (*timer_arm)(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none);
+ void (*timer_wait_running)(struct k_itimer *timr);
};
extern const struct k_clock clock_posix_cpu;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 142b07619918..e4332e3e2d56 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -17,6 +17,8 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
+#include "timekeeping.h"
+
/**
* struct clock_read_data - data required to read from sched_clock()
*
@@ -167,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- unsigned long r;
+ unsigned long r, flags;
char r_unit;
struct clock_read_data rd;
if (cd.rate > rate)
return;
- WARN_ON(!irqs_disabled());
+ /* Cannot register a sched_clock with interrupts on */
+ local_irq_save(flags);
/* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
@@ -231,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
+ local_irq_restore(flags);
+
pr_debug("Registered %pS as sched_clock source\n", read);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 5be6154e2fd2..b5a65e212df2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -42,34 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt)
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
- int bc_moved;
/*
- * We try to cancel the timer first. If the callback is on
- * flight on some other cpu then we let it handle it. If we
- * were able to cancel the timer nothing can rearm it as we
- * own broadcast_lock.
+ * This is called either from enter/exit idle code or from the
+ * broadcast handler. In all cases tick_broadcast_lock is held.
*
- * However we can also be called from the event handler of
- * ce_broadcast_hrtimer itself when it expires. We cannot
- * restart the timer because we are in the callback, but we
- * can set the expiry time and let the callback return
- * HRTIMER_RESTART.
+ * hrtimer_cancel() cannot be called here neither from the
+ * broadcast handler nor from the enter/exit idle code. The idle
+ * code can run into the problem described in bc_shutdown() and the
+ * broadcast handler cannot wait for itself to complete for obvious
+ * reasons.
*
- * Since we are in the idle loop at this point and because
- * hrtimer_{start/cancel} functions call into tracing,
- * calls to these functions must be bound within RCU_NONIDLE.
+ * Each caller tries to arm the hrtimer on its own CPU, but if the
+ * hrtimer callbback function is currently running, then
+ * hrtimer_start() cannot move it and the timer stays on the CPU on
+ * which it is assigned at the moment.
+ *
+ * As this can be called from idle code, the hrtimer_start()
+ * invocation has to be wrapped with RCU_NONIDLE() as
+ * hrtimer_start() can call into tracing.
*/
- RCU_NONIDLE({
- bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
- if (bc_moved)
- hrtimer_start(&bctimer, expires,
- HRTIMER_MODE_ABS_PINNED);});
- if (bc_moved) {
- /* Bind the "device" to the cpu */
- bc->bound_on = smp_processor_id();
- } else if (bc->bound_on == smp_processor_id()) {
- hrtimer_set_expires(&bctimer, expires);
- }
+ RCU_NONIDLE( {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
+ *
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
+ */
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+ } );
return 0;
}
@@ -95,16 +100,12 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
- return HRTIMER_RESTART;
-
return HRTIMER_NORESTART;
}
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
bctimer.function = bc_handler;
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 59225b484e4e..7e5d3524e924 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -11,6 +11,7 @@
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
+#include <linux/nmi.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
@@ -558,6 +559,7 @@ void tick_unfreeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
+ touch_softlockup_watchdog();
tick_resume_local();
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index be9707f68024..a792d21cac64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Do a quick check without holding jiffies_lock:
+ * The READ_ONCE() pairs with two updates done later in this function.
*/
- delta = ktime_sub(now, last_jiffies_update);
+ delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
@@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
- last_jiffies_update = ktime_add(last_jiffies_update,
- tick_period);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
@@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now)
ticks = ktime_divns(delta, incr);
- last_jiffies_update = ktime_add_ns(last_jiffies_update,
- incr * ticks);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add_ns(last_jiffies_update,
+ incr * ticks));
}
do_timer(++ticks);
@@ -172,6 +176,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
bool tick_nohz_full_running;
+EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
static bool check_tick_dependency(atomic_t *dep)
@@ -198,6 +203,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU);
+ return true;
+ }
+
return false;
}
@@ -324,6 +334,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -331,6 +342,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
@@ -344,11 +356,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
*/
tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
@@ -397,6 +411,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
+EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@@ -634,10 +649,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, tick_period);
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
- else
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED_HARD);
+ } else {
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ }
/*
* Reset to make sure next tick stop doesn't get fooled by past
@@ -802,7 +819,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start(&ts->sched_timer, tick,
+ HRTIMER_MODE_ABS_PINNED_HARD);
} else {
hrtimer_set_expires(&ts->sched_timer, tick);
tick_program_event(tick, 1);
@@ -1116,7 +1134,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
/*
* We stopped the tick in idle. Update process times would miss the
@@ -1230,7 +1248,7 @@ static void tick_nohz_switch_to_nohz(void)
* Recycle the hrtimer in ts, so we can share the
* hrtimer_forward with the highres code.
*/
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
/* Get the next period */
next = tick_init_jiffy_update();
@@ -1327,7 +1345,7 @@ void tick_setup_sched_timer(void)
/*
* Emulate tick processing via per-CPU hrtimers:
*/
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per-CPU) */
@@ -1342,7 +1360,7 @@ void tick_setup_sched_timer(void)
}
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 5c54ca632d08..cdd7386115ff 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz);
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
-SYSCALL_DEFINE1(time, time_t __user *, tloc)
+SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
- time_t i = (time_t)ktime_get_real_seconds();
+ __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds();
if (tloc) {
if (put_user(i,tloc))
@@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc)
* architectures that need it).
*/
-SYSCALL_DEFINE1(stime, time_t __user *, tptr)
+SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
struct timespec64 tv;
int err;
@@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
-SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
+SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
if (likely(tv != NULL)) {
@@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
return error;
if (tz) {
- /* Verify we're witin the +-15 hrs range */
+ /* Verify we're within the +-15 hrs range */
if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
return -EINVAL;
@@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
return 0;
}
-SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
+SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
- struct timeval user_tv;
struct timezone new_tz;
if (tv) {
- if (copy_from_user(&user_tv, tv, sizeof(*tv)))
+ if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
+ get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (!timeval_valid(&user_tv))
+ if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
- new_ts.tv_sec = user_tv.tv_sec;
- new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
@@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
- struct timeval user_tv;
struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&user_tv, tv))
+ if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
+ get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (!timeval_valid(&user_tv))
+ if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
- new_ts.tv_sec = user_tv.tv_sec;
- new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
@@ -267,7 +265,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
}
#endif
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
struct __kernel_timex txc; /* Local copy of parameter */
@@ -550,18 +548,21 @@ EXPORT_SYMBOL(set_normalized_timespec64);
*/
struct timespec64 ns_to_timespec64(const s64 nsec)
{
- struct timespec64 ts;
+ struct timespec64 ts = { 0, 0 };
s32 rem;
- if (!nsec)
- return (struct timespec64) {0, 0};
-
- ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
- if (unlikely(rem < 0)) {
- ts.tv_sec--;
- rem += NSEC_PER_SEC;
+ if (likely(nsec > 0)) {
+ ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ ts.tv_nsec = rem;
+ } else if (nsec < 0) {
+ /*
+ * With negative times, tv_sec points to the earlier
+ * second, and tv_nsec counts the nanoseconds since
+ * then, so tv_nsec is always a positive number.
+ */
+ ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1;
+ ts.tv_nsec = NSEC_PER_SEC - rem - 1;
}
- ts.tv_nsec = rem;
return ts;
}
@@ -625,10 +626,12 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
*/
-static unsigned long
-__timespec64_to_jiffies(u64 sec, long nsec)
+
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
{
- nsec = nsec + TICK_NSEC - 1;
+ u64 sec = value->tv_sec;
+ long nsec = value->tv_nsec + TICK_NSEC - 1;
if (sec >= MAX_SEC_IN_JIFFIES){
sec = MAX_SEC_IN_JIFFIES;
@@ -639,18 +642,6 @@ __timespec64_to_jiffies(u64 sec, long nsec)
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-
-static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
-{
- return __timespec64_to_jiffies((u64)sec, nsec);
-}
-
-unsigned long
-timespec64_to_jiffies(const struct timespec64 *value)
-{
- return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
-}
EXPORT_SYMBOL(timespec64_to_jiffies);
void
@@ -668,44 +659,6 @@ jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
EXPORT_SYMBOL(jiffies_to_timespec64);
/*
- * We could use a similar algorithm to timespec_to_jiffies (with a
- * different multiplier for usec instead of nsec). But this has a
- * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
- * usec value, since it's not necessarily integral.
- *
- * We could instead round in the intermediate scaled representation
- * (i.e. in units of 1/2^(large scale) jiffies) but that's also
- * perilous: the scaling introduces a small positive error, which
- * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
- * units to the intermediate before shifting) leads to accidental
- * overflow and overestimates.
- *
- * At the cost of one additional multiplication by a constant, just
- * use the timespec implementation.
- */
-unsigned long
-timeval_to_jiffies(const struct timeval *value)
-{
- return __timespec_to_jiffies(value->tv_sec,
- value->tv_usec * NSEC_PER_USEC);
-}
-EXPORT_SYMBOL(timeval_to_jiffies);
-
-void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
-{
- /*
- * Convert jiffies to nanoseconds and separate with
- * one divide.
- */
- u32 rem;
-
- value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
- NSEC_PER_SEC, &rem);
- value->tv_usec = rem / NSEC_PER_USEC;
-}
-EXPORT_SYMBOL(jiffies_to_timeval);
-
-/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
clock_t jiffies_to_clock_t(unsigned long x)
@@ -880,10 +833,11 @@ int get_timespec64(struct timespec64 *ts,
ts->tv_sec = kts.tv_sec;
- /* Zero out the padding for 32 bit systems or in compat mode */
- if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
+ /* Zero out the padding in compat mode */
+ if (in_compat_syscall())
kts.tv_nsec &= 0xFFFFFFFFUL;
+ /* In 32-bit mode, this drops the padding */
ts->tv_nsec = kts.tv_nsec;
return 0;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d911c8470149..ca69290bee2a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
tk->offs_boot = ktime_add(tk->offs_boot, delta);
+ /*
+ * Timespec representation for VDSO update to avoid 64bit division
+ * on every update.
+ */
+ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 343c7ba33b1c..4820823515e9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64);
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t expiry_lock;
+ atomic_t timer_waiters;
+#endif
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
@@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT_RT
+static __init void timer_base_init_expiry_lock(struct timer_base *base)
+{
+ spin_lock_init(&base->expiry_lock);
+}
+
+static inline void timer_base_lock_expiry(struct timer_base *base)
+{
+ spin_lock(&base->expiry_lock);
+}
+
+static inline void timer_base_unlock_expiry(struct timer_base *base)
+{
+ spin_unlock(&base->expiry_lock);
+}
+
+/*
+ * The counterpart to del_timer_wait_running().
+ *
+ * If there is a waiter for base->expiry_lock, then it was waiting for the
+ * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * the waiter to acquire the lock and make progress.
+ */
+static void timer_sync_wait_running(struct timer_base *base)
+{
+ if (atomic_read(&base->timer_waiters)) {
+ spin_unlock(&base->expiry_lock);
+ spin_lock(&base->expiry_lock);
+ }
+}
+
+/*
+ * This function is called on PREEMPT_RT kernels when the fast path
+ * deletion of a timer failed because the timer callback function was
+ * running.
+ *
+ * This prevents priority inversion, if the softirq thread on a remote CPU
+ * got preempted, and it prevents a life lock when the task which tries to
+ * delete a timer preempted the softirq thread running the timer callback
+ * function.
+ */
+static void del_timer_wait_running(struct timer_list *timer)
+{
+ u32 tf;
+
+ tf = READ_ONCE(timer->flags);
+ if (!(tf & TIMER_MIGRATING)) {
+ struct timer_base *base = get_timer_base(tf);
+
+ /*
+ * Mark the base as contended and grab the expiry lock,
+ * which is held by the softirq across the timer
+ * callback. Drop the lock immediately so the softirq can
+ * expire the next timer. In theory the timer could already
+ * be running again, but that's more than unlikely and just
+ * causes another wait loop.
+ */
+ atomic_inc(&base->timer_waiters);
+ spin_lock_bh(&base->expiry_lock);
+ atomic_dec(&base->timer_waiters);
+ spin_unlock_bh(&base->expiry_lock);
+ }
+}
+#else
+static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
+static inline void timer_base_lock_expiry(struct timer_base *base) { }
+static inline void timer_base_unlock_expiry(struct timer_base *base) { }
+static inline void timer_sync_wait_running(struct timer_base *base) { }
+static inline void del_timer_wait_running(struct timer_list *timer) { }
+#endif
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*/
int del_timer_sync(struct timer_list *timer)
{
+ int ret;
+
#ifdef CONFIG_LOCKDEP
unsigned long flags;
@@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer)
* could lead to deadlock.
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
- for (;;) {
- int ret = try_to_del_timer_sync(timer);
- if (ret >= 0)
- return ret;
- cpu_relax();
- }
+
+ do {
+ ret = try_to_del_timer_sync(timer);
+
+ if (unlikely(ret < 0)) {
+ del_timer_wait_running(timer);
+ cpu_relax();
+ }
+ } while (ret < 0);
+
+ return ret;
}
EXPORT_SYMBOL(del_timer_sync);
#endif
@@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ base->running_timer = NULL;
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ base->running_timer = NULL;
+ timer_sync_wait_running(base);
raw_spin_lock_irq(&base->lock);
}
}
@@ -1593,24 +1678,26 @@ void timer_clear_idle(void)
static int collect_expired_timers(struct timer_base *base,
struct hlist_head *heads)
{
+ unsigned long now = READ_ONCE(jiffies);
+
/*
* NOHZ optimization. After a long idle sleep we need to forward the
* base to current jiffies. Avoid a loop by searching the bitfield for
* the next expiring timer.
*/
- if ((long)(jiffies - base->clk) > 2) {
+ if ((long)(now - base->clk) > 2) {
unsigned long next = __next_timer_interrupt(base);
/*
* If the next timer is ahead of time forward to current
* jiffies, otherwise forward to the next expiry time:
*/
- if (time_after(next, jiffies)) {
+ if (time_after(next, now)) {
/*
* The call site will increment base->clk and then
* terminate the expiry loop immediately.
*/
- base->clk = jiffies;
+ base->clk = now;
return 0;
}
base->clk = next;
@@ -1643,7 +1730,7 @@ void update_process_times(int user_tick)
#endif
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
- run_posix_cpu_timers(p);
+ run_posix_cpu_timers();
}
/**
@@ -1658,6 +1745,7 @@ static inline void __run_timers(struct timer_base *base)
if (!time_after_eq(jiffies, base->clk))
return;
+ timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
/*
@@ -1684,8 +1772,8 @@ static inline void __run_timers(struct timer_base *base)
while (levels--)
expire_timers(base, heads + levels);
}
- base->running_timer = NULL;
raw_spin_unlock_irq(&base->lock);
+ timer_base_unlock_expiry(base);
}
/*
@@ -1930,6 +2018,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ timer_base_init_expiry_lock(base);
}
}
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 8cf3596a4ce6..9577c89179cd 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
struct vdso_timestamp *vdso_ts;
- u64 nsec;
+ u64 nsec, sec;
vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
@@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdata[CS_RAW].mult = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
- /* CLOCK_REALTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
- vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
/* CLOCK_MONOTONIC */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
@@ -45,33 +40,31 @@ static inline void update_vdso_data(struct vdso_data *vdata,
}
vdso_ts->nsec = nsec;
- /* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
- vdso_ts->sec = tk->raw_sec;
- vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+ /* Copy MONOTONIC time for BOOTTIME */
+ sec = vdso_ts->sec;
+ /* Add the boot offset */
+ sec += tk->monotonic_to_boot.tv_sec;
+ nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
- vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec;
- nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
- ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
+ vdso_ts->sec = sec;
+
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
vdso_ts->sec++;
}
vdso_ts->nsec = nsec;
+ /* CLOCK_MONOTONIC_RAW */
+ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts->sec = tk->raw_sec;
+ vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+
/* CLOCK_TAI */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
- /*
- * Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
- */
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
}
void update_vsyscall(struct timekeeper *tk)
@@ -80,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk)
struct vdso_timestamp *vdso_ts;
u64 nsec;
- if (__arch_update_vdso_data()) {
- /*
- * Some architectures might want to skip the update of the
- * data page.
- */
- return;
- }
-
/* copy vsyscall data */
vdso_write_begin(vdata);
vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+ /* CLOCK_REALTIME also required for time() */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
@@ -106,7 +96,17 @@ void update_vsyscall(struct timekeeper *tk)
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
- if (__arch_use_vsyscall(vdata))
+ /*
+ * Read without the seqlock held by clock_getres().
+ * Note: No need to have a second copy.
+ */
+ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+
+ /*
+ * Architectures can opt out of updating the high resolution part
+ * of the VDSO.
+ */
+ if (__arch_update_vdso_data())
update_vdso_data(vdata, tk);
__arch_update_vsyscall(vdata, tk);
@@ -120,10 +120,8 @@ void update_vsyscall_tz(void)
{
struct vdso_data *vdata = __arch_get_k_vdso_data();
- if (__arch_use_vsyscall(vdata)) {
- vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
- vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
- }
+ vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
__arch_sync_vdso_data(vdata);
}
diff --git a/kernel/torture.c b/kernel/torture.c
index a8d9bdfba7c3..7c13f5558b71 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void)
onoff_task = NULL;
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
}
-EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
/*
* Print online/offline testing statistics.
@@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void)
}
shuffler_task = NULL;
}
-EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
/*
* Variables for auto-shutdown. This allows "lights out" torture runs
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 98da8998c25c..91e885194dbc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE
config HAVE_DYNAMIC_FTRACE_WITH_REGS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -76,7 +79,7 @@ config FTRACE_NMI_ENTER
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
- select GLOB
+ select GLOB
bool
config CONTEXT_SWITCH_TRACER
@@ -106,7 +109,6 @@ config PREEMPTIRQ_TRACEPOINTS
config TRACING
bool
- select DEBUG_FS
select RING_BUFFER
select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
@@ -139,6 +141,15 @@ menuconfig FTRACE
if FTRACE
+config BOOTTIME_TRACING
+ bool "Boot-time Tracing support"
+ depends on BOOT_CONFIG && TRACING
+ default y
+ help
+ Enable developer to setup ftrace subsystem via supplemental
+ kernel cmdline at boot time for debugging (tracing) driver
+ initialization and boot process.
+
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
@@ -146,7 +157,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPT
+ select TASKS_RCU if PREEMPTION
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
@@ -170,6 +181,77 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+config DYNAMIC_FTRACE
+ bool "enable/disable function tracing dynamically"
+ depends on FUNCTION_TRACER
+ depends on HAVE_DYNAMIC_FTRACE
+ default y
+ help
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
+
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
+ otherwise has native performance as long as no tracing is active.
+
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
+config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
+config FUNCTION_PROFILER
+ bool "Kernel function profiler"
+ depends on FUNCTION_TRACER
+ default n
+ help
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A "functions" file is created in
+ the trace_stat directory; this file shows the list of functions that
+ have been hit and their counters.
+
+ If in doubt, say N.
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FUNCTION_TRACER
+ select FUNCTION_TRACER
+ select STACKTRACE
+ select KALLSYMS
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. If this is configured with DYNAMIC_FTRACE
+ then it will not have any overhead while the stack tracer
+ is disabled.
+
+ To enable the stack tracer on bootup, pass in 'stacktrace'
+ on the kernel command line.
+
+ The stack tracer can also be enabled or disabled via the
+ sysctl kernel.stack_tracer_enabled
+
+ Say N if unsure.
+
config TRACE_PREEMPT_TOGGLE
bool
help
@@ -179,7 +261,7 @@ config TRACE_PREEMPT_TOGGLE
config PREEMPTIRQ_EVENTS
bool "Enable trace events for preempt and irq disable/enable"
select TRACE_IRQFLAGS
- select TRACE_PREEMPT_TOGGLE if PREEMPT
+ select TRACE_PREEMPT_TOGGLE if PREEMPTION
select GENERIC_TRACER
default n
help
@@ -214,7 +296,7 @@ config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
depends on !ARCH_USES_GETTIMEOFFSET
- depends on PREEMPT
+ depends on PREEMPTION
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
@@ -280,6 +362,19 @@ config HWLAT_TRACER
file. Every time a latency is greater than tracing_thresh, it will
be recorded into the ring buffer.
+config MMIOTRACE
+ bool "Memory mapped IO tracing"
+ depends on HAVE_MMIOTRACE_SUPPORT && PCI
+ select GENERIC_TRACER
+ help
+ Mmiotrace traces Memory Mapped I/O access and is meant for
+ debugging and reverse engineering. It is called from the ioremap
+ implementation and works via page faults. Tracing is disabled by
+ default and can be enabled at run-time.
+
+ See Documentation/trace/mmiotrace.rst.
+ If you are not helping to develop drivers, say N.
+
config ENABLE_DEFAULT_TRACERS
bool "Trace process context switches and events"
depends on !GENERIC_TRACER
@@ -308,7 +403,7 @@ config TRACER_SNAPSHOT
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
- bool "Allow snapshot to swap per CPU"
+ bool "Allow snapshot to swap per CPU"
depends on TRACER_SNAPSHOT
select RING_BUFFER_ALLOW_SWAP
help
@@ -408,30 +503,6 @@ config BRANCH_TRACER
Say N if unsure.
-config STACK_TRACER
- bool "Trace max stack"
- depends on HAVE_FUNCTION_TRACER
- select FUNCTION_TRACER
- select STACKTRACE
- select KALLSYMS
- help
- This special tracer records the maximum stack footprint of the
- kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
-
- This tracer works by hooking into every function call that the
- kernel executes, and keeping a maximum stack depth value and
- stack-trace saved. If this is configured with DYNAMIC_FTRACE
- then it will not have any overhead while the stack tracer
- is disabled.
-
- To enable the stack tracer on bootup, pass in 'stacktrace'
- on the kernel command line.
-
- The stack tracer can also be enabled or disabled via the
- sysctl kernel.stack_tracer_enabled
-
- Say N if unsure.
-
config BLK_DEV_IO_TRACE
bool "Support for tracing block IO actions"
depends on SYSFS
@@ -520,7 +591,8 @@ config BPF_EVENTS
bool
default y
help
- This allows the user to attach BPF programs to kprobe events.
+ This allows the user to attach BPF programs to kprobe, uprobe, and
+ tracepoint events.
config DYNAMIC_EVENTS
def_bool n
@@ -528,48 +600,6 @@ config DYNAMIC_EVENTS
config PROBE_EVENTS
def_bool n
-config DYNAMIC_FTRACE
- bool "enable/disable function tracing dynamically"
- depends on FUNCTION_TRACER
- depends on HAVE_DYNAMIC_FTRACE
- default y
- help
- This option will modify all the calls to function tracing
- dynamically (will patch them out of the binary image and
- replace them with a No-Op instruction) on boot up. During
- compile time, a table is made of all the locations that ftrace
- can function trace, and this table is linked into the kernel
- image. When this is enabled, functions can be individually
- enabled, and the functions not enabled will not affect
- performance of the system.
-
- See the files in /sys/kernel/debug/tracing:
- available_filter_functions
- set_ftrace_filter
- set_ftrace_notrace
-
- This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
- otherwise has native performance as long as no tracing is active.
-
-config DYNAMIC_FTRACE_WITH_REGS
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
-
-config FUNCTION_PROFILER
- bool "Kernel function profiler"
- depends on FUNCTION_TRACER
- default n
- help
- This option enables the kernel function profiler. A file is created
- in debugfs called function_profile_enabled which defaults to zero.
- When a 1 is echoed into this file profiling begins, and when a
- zero is entered, profiling stops. A "functions" file is created in
- the trace_stat directory; this file shows the list of functions that
- have been hit and their counters.
-
- If in doubt, say N.
-
config BPF_KPROBE_OVERRIDE
bool "Enable BPF programs to override a kprobed function"
depends on BPF_EVENTS
@@ -584,54 +614,6 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
-config FTRACE_SELFTEST
- bool
-
-config FTRACE_STARTUP_TEST
- bool "Perform a startup test on ftrace"
- depends on GENERIC_TRACER
- select FTRACE_SELFTEST
- help
- This option performs a series of startup tests on ftrace. On bootup
- a series of tests are made to verify that the tracer is
- functioning properly. It will do tests on all the configured
- tracers of ftrace.
-
-config EVENT_TRACE_STARTUP_TEST
- bool "Run selftest on trace events"
- depends on FTRACE_STARTUP_TEST
- default y
- help
- This option performs a test on all trace events in the system.
- It basically just enables each event and runs some code that
- will trigger events (not necessarily the event it enables)
- This may take some time run as there are a lot of events.
-
-config EVENT_TRACE_TEST_SYSCALLS
- bool "Run selftest on syscall events"
- depends on EVENT_TRACE_STARTUP_TEST
- help
- This option will also enable testing every syscall event.
- It only enables the event and disables it and runs various loads
- with the event enabled. This adds a bit more time for kernel boot
- up since it runs this on every system call defined.
-
- TBD - enable a way to actually call the syscalls as we test their
- events
-
-config MMIOTRACE
- bool "Memory mapped IO tracing"
- depends on HAVE_MMIOTRACE_SUPPORT && PCI
- select GENERIC_TRACER
- help
- Mmiotrace traces Memory Mapped I/O access and is meant for
- debugging and reverse engineering. It is called from the ioremap
- implementation and works via page faults. Tracing is disabled by
- default and can be enabled at run-time.
-
- See Documentation/trace/mmiotrace.rst.
- If you are not helping to develop drivers, say N.
-
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -663,18 +645,17 @@ config HIST_TRIGGERS
See Documentation/trace/histogram.rst.
If in doubt, say N.
-config MMIOTRACE_TEST
- tristate "Test module for mmiotrace"
- depends on MMIOTRACE && m
+config TRACE_EVENT_INJECT
+ bool "Trace event injection"
+ depends on TRACING
help
- This is a dumb module for testing mmiotrace. It is very dangerous
- as it will write garbage to IO memory starting at a given address.
- However, it should be safe to use on e.g. unused portion of VRAM.
+ Allow user-space to inject a specific trace event into the ring
+ buffer. This is mainly used for testing purpose.
- Say N, unless you absolutely know what you are doing.
+ If unsure, say N.
config TRACEPOINT_BENCHMARK
- bool "Add tracepoint that benchmarks tracepoints"
+ bool "Add tracepoint that benchmarks tracepoints"
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
@@ -719,11 +700,86 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
+config TRACE_EVAL_MAP_FILE
+ bool "Show eval mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum/sizeof names
+ instead of their values. This can cause problems for user space tools
+ that use this string to parse the raw data as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert an enum/sizeof into its value. If this macro is used, then
+ the print fmt strings will be converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums/sizeof the kernel tried to convert.
+
+ This option is for debugging the conversions. A file is created
+ in the tracing directory called "eval_map" that will show the
+ names matched with their values and what trace event system they
+ belong too.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "eval_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N.
+
+config GCOV_PROFILE_FTRACE
+ bool "Enable GCOV profiling on ftrace subsystem"
+ depends on GCOV_KERNEL
+ help
+ Enable GCOV profiling on ftrace subsystem for checking
+ which functions/lines are tested.
+
+ If unsure, say N.
+
+ Note that on a kernel compiled with this config, ftrace will
+ run significantly slower.
+
+config FTRACE_SELFTEST
+ bool
+
+config FTRACE_STARTUP_TEST
+ bool "Perform a startup test on ftrace"
+ depends on GENERIC_TRACER
+ select FTRACE_SELFTEST
+ help
+ This option performs a series of startup tests on ftrace. On bootup
+ a series of tests are made to verify that the tracer is
+ functioning properly. It will do tests on all the configured
+ tracers of ftrace.
+
+config EVENT_TRACE_STARTUP_TEST
+ bool "Run selftest on trace events"
+ depends on FTRACE_STARTUP_TEST
+ default y
+ help
+ This option performs a test on all trace events in the system.
+ It basically just enables each event and runs some code that
+ will trigger events (not necessarily the event it enables)
+ This may take some time run as there are a lot of events.
+
+config EVENT_TRACE_TEST_SYSCALLS
+ bool "Run selftest on syscall events"
+ depends on EVENT_TRACE_STARTUP_TEST
+ help
+ This option will also enable testing every syscall event.
+ It only enables the event and disables it and runs various loads
+ with the event enabled. This adds a bit more time for kernel boot
+ up since it runs this on every system call defined.
+
+ TBD - enable a way to actually call the syscalls as we test their
+ events
+
config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
help
- Run a simple self test on the ring buffer on boot up. Late in the
+ Run a simple self test on the ring buffer on boot up. Late in the
kernel boot sequence, the test will start that kicks off
a thread per cpu. Each thread will write various size events
into the ring buffer. Another thread is created to send IPIs
@@ -742,8 +798,18 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config MMIOTRACE_TEST
+ tristate "Test module for mmiotrace"
+ depends on MMIOTRACE && m
+ help
+ This is a dumb module for testing mmiotrace. It is very dangerous
+ as it will write garbage to IO memory starting at a given address.
+ However, it should be safe to use on e.g. unused portion of VRAM.
+
+ Say N, unless you absolutely know what you are doing.
+
config PREEMPTIRQ_DELAY_TEST
- tristate "Preempt / IRQ disable delay thread to test latency tracers"
+ tristate "Test module to create a preempt / IRQ disable delay thread to test latency tracers"
depends on m
help
Select this option to build a test module that can help test latency
@@ -751,51 +817,36 @@ config PREEMPTIRQ_DELAY_TEST
configurable delay. The module busy waits for the duration of the
critical section.
- For example, the following invocation forces a one-time irq-disabled
- critical section for 500us:
- modprobe preemptirq_delay_test test_mode=irq delay=500000
+ For example, the following invocation generates a burst of three
+ irq-disabled critical sections for 500us:
+ modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
If unsure, say N
-config TRACE_EVAL_MAP_FILE
- bool "Show eval mappings for trace events"
- depends on TRACING
- help
- The "print fmt" of the trace events will show the enum/sizeof names
- instead of their values. This can cause problems for user space tools
- that use this string to parse the raw data as user space does not know
- how to convert the string to its value.
-
- To fix this, there's a special macro in the kernel that can be used
- to convert an enum/sizeof into its value. If this macro is used, then
- the print fmt strings will be converted to their values.
-
- If something does not get converted properly, this option can be
- used to show what enums/sizeof the kernel tried to convert.
-
- This option is for debugging the conversions. A file is created
- in the tracing directory called "eval_map" that will show the
- names matched with their values and what trace event system they
- belong too.
+config SYNTH_EVENT_GEN_TEST
+ tristate "Test module for in-kernel synthetic event generation"
+ depends on HIST_TRIGGERS
+ help
+ This option creates a test module to check the base
+ functionality of in-kernel synthetic event definition and
+ generation.
- Normally, the mapping of the strings to values will be freed after
- boot up or module load. With this option, they will not be freed, as
- they are needed for the "eval_map" file. Enabling this option will
- increase the memory footprint of the running kernel.
+ To test, insert the module, and then check the trace buffer
+ for the generated sample events.
- If unsure, say N
+ If unsure, say N.
-config GCOV_PROFILE_FTRACE
- bool "Enable GCOV profiling on ftrace subsystem"
- depends on GCOV_KERNEL
+config KPROBE_EVENT_GEN_TEST
+ tristate "Test module for in-kernel kprobe event generation"
+ depends on KPROBE_EVENTS
help
- Enable GCOV profiling on ftrace subsystem for checking
- which functions/lines are tested.
+ This option creates a test module to check the base
+ functionality of in-kernel kprobe event definition.
- If unsure, say N.
+ To test, insert the module, and then check the trace buffer
+ for the generated kprobe events.
- Note that on a kernel compiled with this config, ftrace will
- run significantly slower.
+ If unsure, say N.
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c2b2148bb1d2..f9dcd19165fa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -44,6 +44,8 @@ obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
+obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o
+obj-$(CONFIG_KPROBE_EVENT_GEN_TEST) += kprobe_event_gen_test.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o
@@ -69,6 +71,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
@@ -82,6 +85,7 @@ endif
obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
+obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2d6e93ab0478..0735ae8545d8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -64,19 +64,18 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len,
- union kernfs_node_id *cgid)
+ const void *data, size_t len, u64 cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
- struct ring_buffer *buffer = NULL;
+ struct trace_buffer *buffer = NULL;
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
- ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
+ ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
- buffer = blk_tr->trace_buffer.buffer;
+ buffer = blk_tr->array_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
@@ -100,8 +99,8 @@ record_it:
t->pid = pid;
t->cpu = cpu;
t->pdu_len = len + cgid_len;
- if (cgid)
- memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
@@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm), NULL);
+ sizeof(tsk->comm), 0);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
local_irq_restore(flags);
}
@@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
- blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+ blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0);
#endif
local_irq_restore(flags);
}
@@ -212,18 +211,18 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data, union kernfs_node_id *cgid)
+ void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
- struct ring_buffer *buffer = NULL;
+ struct trace_buffer *buffer = NULL;
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
- ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
+ ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -249,7 +248,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
- buffer = blk_tr->trace_buffer.buffer;
+ buffer = blk_tr->array_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
@@ -294,7 +293,7 @@ record_it:
t->pdu_len = pdu_len + cgid_len;
if (cgid_len)
- memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
if (pdu_len)
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
@@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q)
}
#ifdef CONFIG_BLK_CGROUP
-static union kernfs_node_id *
-blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
struct blk_trace *bt = q->blk_trace;
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
- return NULL;
+ return 0;
if (!bio->bi_blkg)
- return NULL;
- return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
+ return 0;
+ return cgroup_id(bio_blkcg(bio)->css.cgroup);
}
#else
-static union kernfs_node_id *
-blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
- return NULL;
+ return 0;
}
#endif
-static union kernfs_node_id *
+static u64
blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
{
if (!rq->bio)
- return NULL;
+ return 0;
/* Use the first bio */
return blk_trace_bio_get_cgid(q, rq->bio);
}
@@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what,
- union kernfs_node_id *cgid)
+ unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore,
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL, NULL);
+ NULL, 0);
}
}
@@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL, NULL);
+ 0, 0, NULL, 0);
}
}
@@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
}
@@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
{
- return (void *)(te_blk_io_trace(ent) + 1) +
- (has_cg ? sizeof(union kernfs_node_id) : 0);
+ return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0);
}
-static inline const void *cgid_start(const struct trace_entry *ent)
+static inline u64 t_cgid(const struct trace_entry *ent)
{
- return (void *)(te_blk_io_trace(ent) + 1);
+ return *(u64 *)(te_blk_io_trace(ent) + 1);
}
static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
- return te_blk_io_trace(ent)->pdu_len -
- (has_cg ? sizeof(union kernfs_node_id) : 0);
+ return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
fill_rwbs(rwbs, t);
if (has_cg) {
- const union kernfs_node_id *id = cgid_start(iter->ent);
+ u64 id = t_cgid(iter->ent);
if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
char blkcg_name_buf[NAME_MAX + 1] = "<...>";
@@ -1267,11 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
MAJOR(t->device), MINOR(t->device),
blkcg_name_buf, act, rwbs);
- } else
+ } else {
+ /*
+ * The cgid portion used to be "INO,GEN". Userland
+ * builds a FILEID_INO32_GEN fid out of them and
+ * opens the cgroup using open_by_handle_at(2).
+ * While 32bit ino setups are still the same, 64bit
+ * ones now use the 64bit ino as the whole ID and
+ * no longer use generation.
+ *
+ * Regarldess of the content, always output
+ * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
+ * be mapped back to @id on both 64 and 32bit ino
+ * setups. See __kernfs_fh_to_dentry().
+ */
trace_seq_printf(&iter->seq,
- "%3d,%-3d %x,%-x %2s %3s ",
+ "%3d,%-3d %llx,%-llx %2s %3s ",
MAJOR(t->device), MINOR(t->device),
- id->ino, id->generation, act, rwbs);
+ id & U32_MAX, id >> 32, act, rwbs);
+ }
} else
trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
MAJOR(t->device), MINOR(t->device), act, rwbs);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..19e793aa441a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -138,19 +138,140 @@ static const struct bpf_func_proto bpf_override_return_proto = {
};
#endif
-BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
{
- int ret;
+ int ret = probe_user_read(dst, unsafe_ptr, size);
+
+ if (unlikely(ret < 0))
+ memset(dst, 0, size);
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_user_proto = {
+ .func = bpf_probe_read_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
- ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_proto = {
- .func = bpf_probe_read,
+static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+ .func = bpf_probe_read_user_str,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static __always_inline int
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
+ const bool compat)
+{
+ int ret = security_locked_down(LOCKDOWN_BPF_READ);
+
+ if (unlikely(ret < 0))
+ goto out;
+ ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
+ probe_kernel_read_strict(dst, unsafe_ptr, size);
+ if (unlikely(ret < 0))
+out:
+ memset(dst, 0, size);
+ return ret;
+}
+
+BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+}
+
+static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+ .func = bpf_probe_read_kernel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+ .func = bpf_probe_read_compat,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static __always_inline int
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
+ const bool compat)
+{
+ int ret = security_locked_down(LOCKDOWN_BPF_READ);
+
+ if (unlikely(ret < 0))
+ goto out;
+ /*
+ * The strncpy_from_unsafe_*() call will likely not fill the entire
+ * buffer, but that's okay in this circumstance as we're probing
+ * arbitrary memory anyway similar to bpf_probe_read_*() and might
+ * as well probe the stack. Thus, memory is explicitly cleared
+ * only in error case, so that improper users ignoring return
+ * code altogether don't copy garbage; otherwise length of string
+ * is returned that can be used for bpf_perf_event_output() et al.
+ */
+ ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
+ strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+ if (unlikely(ret < 0))
+out:
+ memset(dst, 0, size);
+ return ret;
+}
+
+BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+}
+
+static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+ .func = bpf_probe_read_kernel_str,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
+ .func = bpf_probe_read_compat_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
@@ -158,7 +279,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
+BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
{
/*
@@ -181,10 +302,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- if (!access_ok(unsafe_ptr, size))
- return -EPERM;
- return probe_kernel_write(unsafe_ptr, src, size);
+ return probe_user_write(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
@@ -500,14 +619,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
-static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
+static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
+struct bpf_nested_pt_regs {
+ struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
- struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
struct perf_raw_frag frag = {
.copy = ctx_copy,
.size = ctx_size,
@@ -522,12 +644,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
.data = meta,
},
};
+ struct perf_sample_data *sd;
+ struct pt_regs *regs;
+ u64 ret;
+
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
+ regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, sd);
+ ret = __bpf_perf_event_output(regs, map, flags, sd);
+out:
+ this_cpu_dec(bpf_event_output_nest_level);
+ return ret;
}
BPF_CALL_0(bpf_get_current_task)
@@ -564,40 +699,11 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
- const void *, unsafe_ptr)
-{
- int ret;
-
- /*
- * The strncpy_from_unsafe() call will likely not fill the entire
- * buffer, but that's okay in this circumstance as we're probing
- * arbitrary memory anyway similar to bpf_probe_read() and might
- * as well probe the stack. Thus, memory is explicitly cleared
- * only in error case, so that improper users ignoring return
- * code altogether don't copy garbage; otherwise length of string
- * is returned that can be used for bpf_perf_event_output() et al.
- */
- ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
- if (unlikely(ret < 0))
- memset(dst, 0, size);
-
- return ret;
-}
-
-static const struct bpf_func_proto bpf_probe_read_str_proto = {
- .func = bpf_probe_read_str,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
+ enum pid_type type;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -607,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry)
struct send_signal_irq_work *work;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+ group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
}
-BPF_CALL_1(bpf_send_signal, u32, sig)
+static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
struct send_signal_irq_work *work = NULL;
@@ -634,7 +740,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
return -EBUSY;
/* Add the current task, which is the target of sending signal,
@@ -643,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
*/
work->task = current;
work->sig = sig;
+ work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+ return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_TGID);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -657,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_1(bpf_send_signal_thread, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_PID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+ .func = bpf_send_signal_thread,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -673,8 +797,6 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
- case BPF_FUNC_probe_read:
- return &bpf_probe_read_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_tail_call:
@@ -701,14 +823,26 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_read_user:
+ return &bpf_probe_read_user_proto;
+ case BPF_FUNC_probe_read_kernel:
+ return &bpf_probe_read_kernel_proto;
+ case BPF_FUNC_probe_read:
+ return &bpf_probe_read_compat_proto;
+ case BPF_FUNC_probe_read_user_str:
+ return &bpf_probe_read_user_str_proto;
+ case BPF_FUNC_probe_read_kernel_str:
+ return &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_probe_read_str:
- return &bpf_probe_read_str_proto;
+ return &bpf_probe_read_compat_str_proto;
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
default:
return NULL;
}
@@ -969,6 +1103,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+extern const struct bpf_func_proto bpf_skb_output_proto;
+
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
@@ -1036,13 +1172,25 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+static const struct bpf_func_proto *
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_skb_output:
+ return &bpf_skb_output_proto;
+#endif
+ default:
+ return raw_tp_prog_func_proto(func_id, prog);
+ }
+}
+
static bool raw_tp_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- /* largest tracepoint in the kernel has 12 args */
- if (off < 0 || off >= sizeof(__u64) * 12)
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
return false;
if (type != BPF_READ)
return false;
@@ -1051,6 +1199,20 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
return true;
}
+static bool tracing_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+ return btf_ctx_access(off, size, type, prog, info);
+}
+
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
.get_func_proto = raw_tp_prog_func_proto,
.is_valid_access = raw_tp_prog_is_valid_access,
@@ -1059,6 +1221,14 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};
+const struct bpf_verifier_ops tracing_verifier_ops = {
+ .get_func_proto = tracing_prog_func_proto,
+ .is_valid_access = tracing_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops tracing_prog_ops = {
+};
+
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8dfd5021b933..1af321dec0f1 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -96,11 +96,34 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
return 0;
}
+/*
+ * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct
+ * functions. But those archs currently don't support direct functions
+ * anyway, and ftrace_find_rec_direct() is just a stub for them.
+ * Define MCOUNT_INSN_SIZE to keep those archs compiling.
+ */
+#ifndef MCOUNT_INSN_SIZE
+/* Make sure this only works without direct calls */
+# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+# error MCOUNT_INSN_SIZE not defined with direct calls enabled
+# endif
+# define MCOUNT_INSN_SIZE 0
+#endif
+
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
struct ftrace_graph_ent trace;
+ /*
+ * Skip graph tracing if the return location is served by direct trampoline,
+ * since call sequence and return addresses is unpredicatable anymore.
+ * Ex: BPF trampoline may call original function and may skip frame
+ * depending on type of BPF programs attached.
+ */
+ if (ftrace_direct_func_count &&
+ ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
+ return -EBUSY;
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -276,7 +299,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
int index = task->curr_ret_stack;
int i;
- if (ret != (unsigned long)return_to_handler)
+ if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
return ret;
if (index < 0)
@@ -294,7 +317,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
{
int task_idx;
- if (ret != (unsigned long)return_to_handler)
+ if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
return ret;
task_idx = task->curr_ret_stack;
@@ -332,9 +355,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
return 0;
}
+/*
+ * Simply points to ftrace_stub, but with the proper protocol.
+ * Defined by the linker script in linux/vmlinux.lds.h
+ */
+extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+
/* The callbacks that hook a function */
-trace_func_graph_ret_t ftrace_graph_return =
- (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -614,7 +642,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
goto out;
ftrace_graph_active--;
- ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_return = ftrace_stub_graph;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca34503f178..3f7ee102868a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/sched/task.h>
#include <linux/kallsyms.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/tracefs.h>
#include <linux/hardirq.h>
@@ -61,8 +62,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_BITS 7
-#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
@@ -145,7 +144,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
{
struct trace_array *tr = op->private;
- if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
+ if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -325,6 +324,8 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
#endif
+ if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
+ return -EBUSY;
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
@@ -462,10 +463,10 @@ static void *function_stat_start(struct tracer_stat *trace)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* function graph compares on total time */
-static int function_stat_cmp(void *p1, void *p2)
+static int function_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_profile *a = p1;
- struct ftrace_profile *b = p2;
+ const struct ftrace_profile *a = p1;
+ const struct ftrace_profile *b = p2;
if (a->time < b->time)
return -1;
@@ -476,10 +477,10 @@ static int function_stat_cmp(void *p1, void *p2)
}
#else
/* not function graph compares against hits */
-static int function_stat_cmp(void *p1, void *p2)
+static int function_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_profile *a = p1;
- struct ftrace_profile *b = p2;
+ const struct ftrace_profile *a = p1;
+ const struct ftrace_profile *b = p2;
if (a->counter < b->counter)
return -1;
@@ -523,8 +524,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -550,7 +550,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
@@ -1017,11 +1018,6 @@ static bool update_all_ops;
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
-struct ftrace_func_entry {
- struct hlist_node hlist;
- unsigned long ip;
-};
-
struct ftrace_func_probe {
struct ftrace_probe_ops *probe_ops;
struct ftrace_ops ops;
@@ -1105,9 +1101,6 @@ struct ftrace_page {
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
-/* estimate from running different kernels */
-#define NR_TO_INIT 10000
-
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
@@ -1369,24 +1362,16 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *
-__ftrace_hash_move(struct ftrace_hash *src)
+static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tn;
- struct hlist_head *hhd;
struct ftrace_hash *new_hash;
- int size = src->count;
+ struct hlist_head *hhd;
+ struct hlist_node *tn;
int bits = 0;
int i;
/*
- * If the new source is empty, just return the empty_hash.
- */
- if (ftrace_hash_empty(src))
- return EMPTY_HASH;
-
- /*
* Make the hash size about 1/2 the # found
*/
for (size /= 2; size; size >>= 1)
@@ -1410,10 +1395,23 @@ __ftrace_hash_move(struct ftrace_hash *src)
__add_hash_entry(new_hash, entry);
}
}
-
return new_hash;
}
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
+{
+ int size = src->count;
+
+ /*
+ * If the new source is empty, just return the empty_hash.
+ */
+ if (ftrace_hash_empty(src))
+ return EMPTY_HASH;
+
+ return dup_hash(src, size);
+}
+
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1533,6 +1531,26 @@ static int ftrace_cmp_recs(const void *a, const void *b)
return 0;
}
+static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec = NULL;
+ struct dyn_ftrace key;
+
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ }
+ return rec;
+}
+
/**
* ftrace_location_range - return the first address of a traced location
* if it touches the given ip range
@@ -1547,23 +1565,11 @@ static int ftrace_cmp_recs(const void *a, const void *b)
*/
unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
- struct ftrace_page *pg;
struct dyn_ftrace *rec;
- struct dyn_ftrace key;
-
- key.ip = start;
- key.flags = end; /* overload flags, as it is unsigned long */
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
- start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
- continue;
- rec = bsearch(&key, pg->records, pg->index,
- sizeof(struct dyn_ftrace),
- ftrace_cmp_recs);
- if (rec)
- return rec->ip;
- }
+ rec = lookup_rec(start, end);
+ if (rec)
+ return rec->ip;
return 0;
}
@@ -1714,6 +1720,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return false;
+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags |= FTRACE_FL_DIRECT;
+
/*
* If there's only a single callback registered to a
* function, and the ops has a trampoline registered
@@ -1742,6 +1751,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
rec->flags--;
/*
+ * Only the internal direct_ops should have the
+ * DIRECT flag set. Thus, if it is removing a
+ * function, then that function should no longer
+ * be direct.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags &= ~FTRACE_FL_DIRECT;
+
+ /*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
* still any ops for this record that wants regs.
@@ -2076,15 +2094,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* If enabling and the REGS flag does not match the REGS_EN, or
* the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
* this record. Set flags to fail the compare against ENABLED.
+ * Same for direct calls.
*/
if (flag) {
- if (!(rec->flags & FTRACE_FL_REGS) !=
+ if (!(rec->flags & FTRACE_FL_REGS) !=
!(rec->flags & FTRACE_FL_REGS_EN))
flag |= FTRACE_FL_REGS;
- if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
!(rec->flags & FTRACE_FL_TRAMP_EN))
flag |= FTRACE_FL_TRAMP;
+
+ /*
+ * Direct calls are special, as count matters.
+ * We must test the record for direct, if the
+ * DIRECT and DIRECT_EN do not match, but only
+ * if the count is 1. That's because, if the
+ * count is something other than one, we do not
+ * want the direct enabled (it will be done via the
+ * direct helper). But if DIRECT_EN is set, and
+ * the count is not one, we need to clear it.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_DIRECT) !=
+ !(rec->flags & FTRACE_FL_DIRECT_EN))
+ flag |= FTRACE_FL_DIRECT;
+ } else if (rec->flags & FTRACE_FL_DIRECT_EN) {
+ flag |= FTRACE_FL_DIRECT;
+ }
}
/* If the state of this record hasn't changed, then do nothing */
@@ -2109,6 +2146,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+ if (flag & FTRACE_FL_DIRECT) {
+ /*
+ * If there's only one user (direct_ops helper)
+ * then we can call the direct function
+ * directly (no ftrace trampoline).
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_DIRECT)
+ rec->flags |= FTRACE_FL_DIRECT_EN;
+ else
+ rec->flags &= ~FTRACE_FL_DIRECT_EN;
+ } else {
+ /*
+ * Can only call directly if there's
+ * only one callback to the function.
+ */
+ rec->flags &= ~FTRACE_FL_DIRECT_EN;
+ }
+ }
}
/*
@@ -2138,7 +2194,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -2293,6 +2349,47 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/* Protected by rcu_tasks for reading, and direct_mutex for writing */
+static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static DEFINE_MUTEX(direct_mutex);
+int ftrace_direct_func_count;
+
+/*
+ * Search the direct_functions hash to see if the given instruction pointer
+ * has a direct caller attached to it.
+ */
+unsigned long ftrace_find_rec_direct(unsigned long ip)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = __ftrace_lookup_ip(direct_functions, ip);
+ if (!entry)
+ return 0;
+
+ return entry->direct;
+}
+
+static void call_direct_funcs(unsigned long ip, unsigned long pip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ unsigned long addr;
+
+ addr = ftrace_find_rec_direct(ip);
+ if (!addr)
+ return;
+
+ arch_ftrace_set_direct_caller(regs, addr);
+}
+
+struct ftrace_ops direct_ops = {
+ .func = call_direct_funcs,
+ .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
+ | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
+ | FTRACE_OPS_FL_PERMANENT,
+};
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -2306,6 +2403,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
+ unsigned long addr;
+
+ if ((rec->flags & FTRACE_FL_DIRECT) &&
+ (ftrace_rec_count(rec) == 1)) {
+ addr = ftrace_find_rec_direct(rec->ip);
+ if (addr)
+ return addr;
+ WARN_ON_ONCE(1);
+ }
/* Trampolines take precedence over regs */
if (rec->flags & FTRACE_FL_TRAMP) {
@@ -2338,6 +2444,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
+ unsigned long addr;
+
+ /* Direct calls take precedence over trampolines */
+ if (rec->flags & FTRACE_FL_DIRECT_EN) {
+ addr = ftrace_find_rec_direct(rec->ip);
+ if (addr)
+ return addr;
+ WARN_ON_ONCE(1);
+ }
/* Trampolines take precedence over regs */
if (rec->flags & FTRACE_FL_TRAMP_EN) {
@@ -2493,14 +2608,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
}
static int
-ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
+ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
{
int ret;
if (unlikely(ftrace_disabled))
return 0;
- ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
+ ret = ftrace_init_nop(mod, rec);
if (ret) {
ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
@@ -2814,7 +2929,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
* synchornize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
free_ops:
@@ -2860,6 +2975,8 @@ static void ftrace_shutdown_sysctl(void)
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
+unsigned long ftrace_number_of_pages;
+unsigned long ftrace_number_of_groups;
static inline int ops_traces_mod(struct ftrace_ops *ops)
{
@@ -2942,7 +3059,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* to the NOP instructions.
*/
if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_code_disable(mod, p))
+ !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
@@ -2984,6 +3101,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
goto again;
}
+ ftrace_number_of_pages += 1 << order;
+ ftrace_number_of_groups++;
+
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
pg->size = cnt;
@@ -3039,6 +3159,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
start_pg = pg->next;
kfree(pg);
pg = start_pg;
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
@@ -3095,6 +3217,14 @@ t_probe_next(struct seq_file *m, loff_t *pos)
hnd = &iter->probe_entry->hlist;
hash = iter->probe->ops.func_hash->filter_hash;
+
+ /*
+ * A probe being registered may temporarily have an empty hash
+ * and it's at the end of the func_probes list.
+ */
+ if (!hash || hash == EMPTY_HASH)
+ return NULL;
+
size = 1 << hash->size_bits;
retry:
@@ -3441,10 +3571,11 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
- seq_printf(m, " (%ld)%s%s",
+ seq_printf(m, " (%ld)%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
- rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
+ rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -3460,6 +3591,13 @@ static int t_show(struct seq_file *m, void *v)
} else {
add_trampoline_func(m, NULL, rec);
}
+ if (rec->flags & FTRACE_FL_DIRECT) {
+ unsigned long direct;
+
+ direct = ftrace_find_rec_direct(rec->ip);
+ if (direct)
+ seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
+ }
}
seq_putc(m, '\n');
@@ -3478,6 +3616,11 @@ static int
ftrace_avail_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -3497,6 +3640,15 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
+ /*
+ * This shows us what functions are currently being
+ * traced and by what. Not sure if we want lockdown
+ * to hide such critical information for an admin.
+ * Although, perhaps it can show information we don't
+ * want people to see, but if something is tracing
+ * something, we probably want to know about it.
+ */
+
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
if (!iter)
return -ENOMEM;
@@ -3532,21 +3684,22 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct ftrace_hash *hash;
struct list_head *mod_head;
struct trace_array *tr = ops->private;
- int ret = 0;
+ int ret = -ENOMEM;
ftrace_ops_init(ops);
if (unlikely(ftrace_disabled))
return -ENODEV;
+ if (tracing_check_open_get_tr(tr))
+ return -ENODEV;
+
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
- return -ENOMEM;
+ goto out;
- if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
- kfree(iter);
- return -ENOMEM;
- }
+ if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX))
+ goto out;
iter->ops = ops;
iter->flags = flag;
@@ -3576,13 +3729,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
if (!iter->hash) {
trace_parser_put(&iter->parser);
- kfree(iter);
- ret = -ENOMEM;
goto out_unlock;
}
} else
iter->hash = hash;
+ ret = 0;
+
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -3594,7 +3747,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
/* Failed */
free_ftrace_hash(iter->hash);
trace_parser_put(&iter->parser);
- kfree(iter);
}
} else
file->private_data = iter;
@@ -3602,6 +3754,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
out_unlock:
mutex_unlock(&ops->func_hash->regex_lock);
+ out:
+ if (ret) {
+ kfree(iter);
+ if (tr)
+ trace_array_put(tr);
+ }
+
return ret;
}
@@ -3610,6 +3769,7 @@ ftrace_filter_open(struct inode *inode, struct file *file)
{
struct ftrace_ops *ops = inode->i_private;
+ /* Checks for tracefs lockdown */
return ftrace_regex_open(ops,
FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES,
inode, file);
@@ -3620,6 +3780,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
{
struct ftrace_ops *ops = inode->i_private;
+ /* Checks for tracefs lockdown */
return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
inode, file);
}
@@ -4320,12 +4481,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
mutex_unlock(&ftrace_lock);
+ /*
+ * Note, there's a small window here that the func_hash->filter_hash
+ * may be NULL or empty. Need to be carefule when reading the loop.
+ */
mutex_lock(&probe->ops.func_hash->regex_lock);
orig_hash = &probe->ops.func_hash->filter_hash;
old_hash = *orig_hash;
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
+ if (!hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = ftrace_match_records(hash, glob, strlen(glob));
/* Nothing found? */
@@ -4759,6 +4929,366 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
+struct ftrace_direct_func {
+ struct list_head next;
+ unsigned long addr;
+ int count;
+};
+
+static LIST_HEAD(ftrace_direct_funcs);
+
+/**
+ * ftrace_find_direct_func - test an address if it is a registered direct caller
+ * @addr: The address of a registered direct caller
+ *
+ * This searches to see if a ftrace direct caller has been registered
+ * at a specific address, and if so, it returns a descriptor for it.
+ *
+ * This can be used by architecture code to see if an address is
+ * a direct caller (trampoline) attached to a fentry/mcount location.
+ * This is useful for the function_graph tracer, as it may need to
+ * do adjustments if it traced a location that also has a direct
+ * trampoline attached to it.
+ */
+struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
+{
+ struct ftrace_direct_func *entry;
+ bool found = false;
+
+ /* May be called by fgraph trampoline (protected by rcu tasks) */
+ list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
+ if (entry->addr == addr) {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ return entry;
+
+ return NULL;
+}
+
+/**
+ * register_ftrace_direct - Call a custom trampoline directly
+ * @ip: The address of the nop at the beginning of a function
+ * @addr: The address of the trampoline to call at @ip
+ *
+ * This is used to connect a direct call from the nop location (@ip)
+ * at the start of ftrace traced functions. The location that it calls
+ * (@addr) must be able to handle a direct call, and save the parameters
+ * of the function being traced, and restore them (or inject new ones
+ * if needed), before returning.
+ *
+ * Returns:
+ * 0 on success
+ * -EBUSY - Another direct function is already attached (there can be only one)
+ * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ * -ENOMEM - There was an allocation failure.
+ */
+int register_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *free_hash = NULL;
+ struct dyn_ftrace *rec;
+ int ret = -EBUSY;
+
+ mutex_lock(&direct_mutex);
+
+ /* See if there's a direct function at @ip already */
+ if (ftrace_find_rec_direct(ip))
+ goto out_unlock;
+
+ ret = -ENODEV;
+ rec = lookup_rec(ip, ip);
+ if (!rec)
+ goto out_unlock;
+
+ /*
+ * Check if the rec says it has a direct call but we didn't
+ * find one earlier?
+ */
+ if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
+ goto out_unlock;
+
+ /* Make sure the ip points to the exact record */
+ if (ip != rec->ip) {
+ ip = rec->ip;
+ /* Need to check this ip for a direct. */
+ if (ftrace_find_rec_direct(ip))
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ if (ftrace_hash_empty(direct_functions) ||
+ direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+ struct ftrace_hash *new_hash;
+ int size = ftrace_hash_empty(direct_functions) ? 0 :
+ direct_functions->count + 1;
+
+ if (size < 32)
+ size = 32;
+
+ new_hash = dup_hash(direct_functions, size);
+ if (!new_hash)
+ goto out_unlock;
+
+ free_hash = direct_functions;
+ direct_functions = new_hash;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out_unlock;
+
+ direct = ftrace_find_direct_func(addr);
+ if (!direct) {
+ direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ if (!direct) {
+ kfree(entry);
+ goto out_unlock;
+ }
+ direct->addr = addr;
+ direct->count = 0;
+ list_add_rcu(&direct->next, &ftrace_direct_funcs);
+ ftrace_direct_func_count++;
+ }
+
+ entry->ip = ip;
+ entry->direct = addr;
+ __add_hash_entry(direct_functions, entry);
+
+ ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
+ if (ret)
+ remove_hash_entry(direct_functions, entry);
+
+ if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
+ ret = register_ftrace_function(&direct_ops);
+ if (ret)
+ ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+ }
+
+ if (ret) {
+ kfree(entry);
+ if (!direct->count) {
+ list_del_rcu(&direct->next);
+ synchronize_rcu_tasks();
+ kfree(direct);
+ if (free_hash)
+ free_ftrace_hash(free_hash);
+ free_hash = NULL;
+ ftrace_direct_func_count--;
+ }
+ } else {
+ direct->count++;
+ }
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (free_hash) {
+ synchronize_rcu_tasks();
+ free_ftrace_hash(free_hash);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
+
+static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
+ struct dyn_ftrace **recp)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+
+ rec = lookup_rec(*ip, *ip);
+ if (!rec)
+ return NULL;
+
+ entry = __ftrace_lookup_ip(direct_functions, rec->ip);
+ if (!entry) {
+ WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+ return NULL;
+ }
+
+ WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
+
+ /* Passed in ip just needs to be on the call site */
+ *ip = rec->ip;
+
+ if (recp)
+ *recp = rec;
+
+ return entry;
+}
+
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+ struct ftrace_func_entry *entry;
+ int ret = -ENODEV;
+
+ mutex_lock(&direct_mutex);
+
+ entry = find_direct_entry(&ip, NULL);
+ if (!entry)
+ goto out_unlock;
+
+ if (direct_functions->count == 1)
+ unregister_ftrace_function(&direct_ops);
+
+ ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+
+ WARN_ON(ret);
+
+ remove_hash_entry(direct_functions, entry);
+
+ direct = ftrace_find_direct_func(addr);
+ if (!WARN_ON(!direct)) {
+ /* This is the good path (see the ! before WARN) */
+ direct->count--;
+ WARN_ON(direct->count < 0);
+ if (!direct->count) {
+ list_del_rcu(&direct->next);
+ synchronize_rcu_tasks();
+ kfree(direct);
+ ftrace_direct_func_count--;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
+
+static struct ftrace_ops stub_ops = {
+ .func = ftrace_stub,
+};
+
+/**
+ * ftrace_modify_direct_caller - modify ftrace nop directly
+ * @entry: The ftrace hash entry of the direct helper for @rec
+ * @rec: The record representing the function site to patch
+ * @old_addr: The location that the site at @rec->ip currently calls
+ * @new_addr: The location that the site at @rec->ip should call
+ *
+ * An architecture may overwrite this function to optimize the
+ * changing of the direct callback on an ftrace nop location.
+ * This is called with the ftrace_lock mutex held, and no other
+ * ftrace callbacks are on the associated record (@rec). Thus,
+ * it is safe to modify the ftrace record, where it should be
+ * currently calling @old_addr directly, to call @new_addr.
+ *
+ * Safety checks should be made to make sure that the code at
+ * @rec->ip is currently calling @old_addr. And this must
+ * also update entry->direct to @new_addr.
+ */
+int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
+ struct dyn_ftrace *rec,
+ unsigned long old_addr,
+ unsigned long new_addr)
+{
+ unsigned long ip = rec->ip;
+ int ret;
+
+ /*
+ * The ftrace_lock was used to determine if the record
+ * had more than one registered user to it. If it did,
+ * we needed to prevent that from changing to do the quick
+ * switch. But if it did not (only a direct caller was attached)
+ * then this function is called. But this function can deal
+ * with attached callers to the rec that we care about, and
+ * since this function uses standard ftrace calls that take
+ * the ftrace_lock mutex, we need to release it.
+ */
+ mutex_unlock(&ftrace_lock);
+
+ /*
+ * By setting a stub function at the same address, we force
+ * the code to call the iterator and the direct_ops helper.
+ * This means that @ip does not call the direct call, and
+ * we can simply modify it.
+ */
+ ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
+ if (ret)
+ goto out_lock;
+
+ ret = register_ftrace_function(&stub_ops);
+ if (ret) {
+ ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
+ goto out_lock;
+ }
+
+ entry->direct = new_addr;
+
+ /*
+ * By removing the stub, we put back the direct call, calling
+ * the @new_addr.
+ */
+ unregister_ftrace_function(&stub_ops);
+ ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
+
+ out_lock:
+ mutex_lock(&ftrace_lock);
+
+ return ret;
+}
+
+/**
+ * modify_ftrace_direct - Modify an existing direct call to call something else
+ * @ip: The instruction pointer to modify
+ * @old_addr: The address that the current @ip calls directly
+ * @new_addr: The address that the @ip should call
+ *
+ * This modifies a ftrace direct caller at an instruction pointer without
+ * having to disable it first. The direct call will switch over to the
+ * @new_addr without missing anything.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -ENODEV : the @ip given has no direct caller attached
+ * -EINVAL : the @old_addr does not match the current direct caller
+ */
+int modify_ftrace_direct(unsigned long ip,
+ unsigned long old_addr, unsigned long new_addr)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int ret = -ENODEV;
+
+ mutex_lock(&direct_mutex);
+
+ mutex_lock(&ftrace_lock);
+ entry = find_direct_entry(&ip, &rec);
+ if (!entry)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ if (entry->direct != old_addr)
+ goto out_unlock;
+
+ /*
+ * If there's no other ftrace callback on the rec->ip location,
+ * then it can be changed directly by the architecture.
+ * If there is another caller, then we just need to change the
+ * direct caller helper to point to @new_addr.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
+ } else {
+ entry->direct = new_addr;
+ ret = 0;
+ }
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
+ mutex_unlock(&direct_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
/**
* ftrace_set_filter_ip - set a function to filter on in ftrace by address
* @ops - the ops to set the filter with
@@ -4929,7 +5459,7 @@ static void __init set_ftrace_early_graph(char *buf, int enable)
struct ftrace_hash *hash;
hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
- if (WARN_ON(!hash))
+ if (MEM_FAIL(!hash, "Failed to allocate hash\n"))
return;
while (buf) {
@@ -5020,6 +5550,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_unlock(&iter->ops->func_hash->regex_lock);
free_ftrace_hash(iter->hash);
+ if (iter->tr)
+ trace_array_put(iter->tr);
kfree(iter);
return 0;
@@ -5059,8 +5591,8 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
-struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH;
enum graph_filter_type {
GRAPH_FILTER_NOTRACE = 0,
@@ -5177,9 +5709,13 @@ static int
__ftrace_graph_open(struct inode *inode, struct file *file,
struct ftrace_graph_data *fgd)
{
- int ret = 0;
+ int ret;
struct ftrace_hash *new_hash = NULL;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
@@ -5331,8 +5867,15 @@ ftrace_graph_release(struct inode *inode, struct file *file)
mutex_unlock(&graph_lock);
- /* Wait till all users are no longer using the old hash */
- synchronize_rcu();
+ /*
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ schedule_on_each_cpu(ftrace_sync);
free_ftrace_hash(old_hash);
}
@@ -5771,6 +6314,8 @@ void ftrace_release_mod(struct module *mod)
free_pages((unsigned long)pg->records, order);
tmp_page = pg->next;
kfree(pg);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
}
@@ -6019,11 +6564,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
{
struct ftrace_func_entry *entry;
- if (ftrace_hash_empty(hash))
- return;
-
- entry = __ftrace_lookup_ip(hash, func->ip);
-
+ entry = ftrace_lookup_ip(hash, func->ip);
/*
* Do not allow this rec to match again.
* Yeah, it may waste some memory, but will be removed
@@ -6057,7 +6598,7 @@ static void add_to_clear_hash_list(struct list_head *clear_list,
func = kmalloc(sizeof(*func), GFP_KERNEL);
if (!func) {
- WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+ MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n");
return;
}
@@ -6116,6 +6657,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
@@ -6171,6 +6714,9 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
+ pr_info("ftrace: allocated %ld pages with %ld groups\n",
+ ftrace_number_of_pages, ftrace_number_of_groups);
+
set_ftrace_early_filters();
return;
@@ -6378,7 +6924,7 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->function_pids);
- this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
trace_ignore_this_task(pid_list, next));
}
@@ -6432,7 +6978,7 @@ static void clear_ftrace_pids(struct trace_array *tr)
unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false;
rcu_assign_pointer(tr->function_pids, NULL);
@@ -6487,9 +7033,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
- if (v == FTRACE_NO_PIDS)
+ if (v == FTRACE_NO_PIDS) {
+ (*pos)++;
return NULL;
-
+ }
return trace_pid_next(pid_list, v, pos);
}
@@ -6524,8 +7071,9 @@ ftrace_pid_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret = 0;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
@@ -6555,7 +7103,7 @@ static void ignore_task_cpu(void *data)
pid_list = rcu_dereference_protected(tr->function_pids,
mutex_is_locked(&ftrace_lock));
- this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
trace_ignore_this_task(pid_list, current));
}
@@ -6710,6 +7258,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+static bool is_permanent_ops_registered(void)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->flags & FTRACE_OPS_FL_PERMANENT)
+ return true;
+ } while_for_each_ftrace_op(op);
+
+ return false;
+}
+
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -6727,8 +7287,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
- last_ftrace_enabled = !!ftrace_enabled;
-
if (ftrace_enabled) {
/* we are starting ftrace again */
@@ -6739,12 +7297,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
} else {
+ if (is_permanent_ops_registered()) {
+ ftrace_enabled = true;
+ ret = -EBUSY;
+ goto out;
+ }
+
/* stopping ftrace calls (just send to ftrace_stub) */
ftrace_trace_function = ftrace_stub;
ftrace_shutdown_sysctl();
}
+ last_ftrace_enabled = !!ftrace_enabled;
out:
mutex_unlock(&ftrace_lock);
return ret;
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * can use rcu_dereference_raw_check() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_check() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_notrace(list); \
+ op = rcu_dereference_raw_check(list); \
do
/*
* Optimized for just a single item in the list (as that is the normal case).
*/
#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ while (likely(op = rcu_dereference_raw_check((op)->next)) && \
unlikely((op) != &ftrace_list_end))
extern struct ftrace_ops __rcu *ftrace_ops_list;
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
new file mode 100644
index 000000000000..18b0f1cbb947
--- /dev/null
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module for in-kernel kprobe event creation and generation.
+ *
+ * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/trace_events.h>
+
+/*
+ * This module is a simple test of basic functionality for in-kernel
+ * kprobe/kretprobe event creation. The first test uses
+ * kprobe_event_gen_cmd_start(), kprobe_event_add_fields() and
+ * kprobe_event_gen_cmd_end() to create a kprobe event, which is then
+ * enabled in order to generate trace output. The second creates a
+ * kretprobe event using kretprobe_event_gen_cmd_start() and
+ * kretprobe_event_gen_cmd_end(), and is also then enabled.
+ *
+ * To test, select CONFIG_KPROBE_EVENT_GEN_TEST and build the module.
+ * Then:
+ *
+ * # insmod kernel/trace/kprobe_event_gen_test.ko
+ * # cat /sys/kernel/debug/tracing/trace
+ *
+ * You should see many instances of the "gen_kprobe_test" and
+ * "gen_kretprobe_test" events in the trace buffer.
+ *
+ * To remove the events, remove the module:
+ *
+ * # rmmod kprobe_event_gen_test
+ *
+ */
+
+static struct trace_event_file *gen_kprobe_test;
+static struct trace_event_file *gen_kretprobe_test;
+
+/*
+ * Test to make sure we can create a kprobe event, then add more
+ * fields.
+ */
+static int __init test_gen_kprobe_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Define the gen_kprobe_test event with the first 2 kprobe
+ * fields.
+ */
+ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
+ "do_sys_open",
+ "dfd=%ax", "filename=%dx");
+ if (ret)
+ goto free;
+
+ /* Use kprobe_event_add_fields to add the rest of the fields */
+
+ ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+ if (ret)
+ goto free;
+
+ /*
+ * This actually creates the event.
+ */
+ ret = kprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_kprobe_test event file. We need to prevent
+ * the instance and event from disappearing from underneath
+ * us, which trace_get_event_file() does (though in this case
+ * we're using the top-level instance which never goes away).
+ */
+ gen_kprobe_test = trace_get_event_file(NULL, "kprobes",
+ "gen_kprobe_test");
+ if (IS_ERR(gen_kprobe_test)) {
+ ret = PTR_ERR(gen_kprobe_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes", "gen_kprobe_test", true);
+ if (ret) {
+ trace_put_event_file(gen_kprobe_test);
+ goto delete;
+ }
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = kprobe_event_delete("gen_kprobe_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+/*
+ * Test to make sure we can create a kretprobe event.
+ */
+static int __init test_gen_kretprobe_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Define the kretprobe event.
+ */
+ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
+ "do_sys_open",
+ "$retval");
+ if (ret)
+ goto free;
+
+ /*
+ * This actually creates the event.
+ */
+ ret = kretprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_kretprobe_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ gen_kretprobe_test = trace_get_event_file(NULL, "kprobes",
+ "gen_kretprobe_test");
+ if (IS_ERR(gen_kretprobe_test)) {
+ ret = PTR_ERR(gen_kretprobe_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes", "gen_kretprobe_test", true);
+ if (ret) {
+ trace_put_event_file(gen_kretprobe_test);
+ goto delete;
+ }
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = kprobe_event_delete("gen_kretprobe_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+static int __init kprobe_event_gen_test_init(void)
+{
+ int ret;
+
+ ret = test_gen_kprobe_cmd();
+ if (ret)
+ return ret;
+
+ ret = test_gen_kretprobe_cmd();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+ trace_put_event_file(gen_kretprobe_test);
+ WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
+ }
+
+ return ret;
+}
+
+static void __exit kprobe_event_gen_test_exit(void)
+{
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kprobe_test);
+
+ /* Now unregister and free the event */
+ WARN_ON(kprobe_event_delete("gen_kprobe_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kretprobe_test);
+
+ /* Now unregister and free the event */
+ WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
+}
+
+module_init(kprobe_event_gen_test_init)
+module_exit(kprobe_event_gen_test_exit)
+
+MODULE_AUTHOR("Tom Zanussi");
+MODULE_DESCRIPTION("kprobe event generation test");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index d8765c952fab..31c0fad4cb9e 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -10,18 +10,25 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel.h>
+#include <linux/kobject.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/string.h>
+#include <linux/sysfs.h>
static ulong delay = 100;
-static char test_mode[10] = "irq";
+static char test_mode[12] = "irq";
+static uint burst_size = 1;
-module_param_named(delay, delay, ulong, S_IRUGO);
-module_param_string(test_mode, test_mode, 10, S_IRUGO);
-MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)");
-MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)");
+module_param_named(delay, delay, ulong, 0444);
+module_param_string(test_mode, test_mode, 12, 0444);
+module_param_named(burst_size, burst_size, uint, 0444);
+MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
+MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
+MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
static void busy_wait(ulong time)
{
@@ -34,37 +41,136 @@ static void busy_wait(ulong time)
} while ((end - start) < (time * 1000));
}
-static int preemptirq_delay_run(void *data)
+static __always_inline void irqoff_test(void)
{
unsigned long flags;
+ local_irq_save(flags);
+ busy_wait(delay);
+ local_irq_restore(flags);
+}
- if (!strcmp(test_mode, "irq")) {
- local_irq_save(flags);
- busy_wait(delay);
- local_irq_restore(flags);
- } else if (!strcmp(test_mode, "preempt")) {
- preempt_disable();
- busy_wait(delay);
- preempt_enable();
+static __always_inline void preemptoff_test(void)
+{
+ preempt_disable();
+ busy_wait(delay);
+ preempt_enable();
+}
+
+static void execute_preemptirqtest(int idx)
+{
+ if (!strcmp(test_mode, "irq"))
+ irqoff_test();
+ else if (!strcmp(test_mode, "preempt"))
+ preemptoff_test();
+ else if (!strcmp(test_mode, "alternate")) {
+ if (idx % 2 == 0)
+ irqoff_test();
+ else
+ preemptoff_test();
}
+}
+
+#define DECLARE_TESTFN(POSTFIX) \
+ static void preemptirqtest_##POSTFIX(int idx) \
+ { \
+ execute_preemptirqtest(idx); \
+ } \
+/*
+ * We create 10 different functions, so that we can get 10 different
+ * backtraces.
+ */
+DECLARE_TESTFN(0)
+DECLARE_TESTFN(1)
+DECLARE_TESTFN(2)
+DECLARE_TESTFN(3)
+DECLARE_TESTFN(4)
+DECLARE_TESTFN(5)
+DECLARE_TESTFN(6)
+DECLARE_TESTFN(7)
+DECLARE_TESTFN(8)
+DECLARE_TESTFN(9)
+
+static void (*testfuncs[])(int) = {
+ preemptirqtest_0,
+ preemptirqtest_1,
+ preemptirqtest_2,
+ preemptirqtest_3,
+ preemptirqtest_4,
+ preemptirqtest_5,
+ preemptirqtest_6,
+ preemptirqtest_7,
+ preemptirqtest_8,
+ preemptirqtest_9,
+};
+
+#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs)
+
+static int preemptirq_delay_run(void *data)
+{
+ int i;
+ int s = MIN(burst_size, NR_TEST_FUNCS);
+
+ for (i = 0; i < s; i++)
+ (testfuncs[i])(i);
return 0;
}
-static int __init preemptirq_delay_init(void)
+static struct task_struct *preemptirq_start_test(void)
{
char task_name[50];
- struct task_struct *test_task;
snprintf(task_name, sizeof(task_name), "%s_test", test_mode);
+ return kthread_run(preemptirq_delay_run, NULL, task_name);
+}
+
+
+static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ preemptirq_start_test();
+ return count;
+}
+
+static struct kobj_attribute trigger_attribute =
+ __ATTR(trigger, 0200, NULL, trigger_store);
+
+static struct attribute *attrs[] = {
+ &trigger_attribute.attr,
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+static struct kobject *preemptirq_delay_kobj;
+
+static int __init preemptirq_delay_init(void)
+{
+ struct task_struct *test_task;
+ int retval;
+
+ test_task = preemptirq_start_test();
+ retval = PTR_ERR_OR_ZERO(test_task);
+ if (retval != 0)
+ return retval;
+
+ preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test",
+ kernel_kobj);
+ if (!preemptirq_delay_kobj)
+ return -ENOMEM;
+
+ retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group);
+ if (retval)
+ kobject_put(preemptirq_delay_kobj);
- test_task = kthread_run(preemptirq_delay_run, NULL, task_name);
- return PTR_ERR_OR_ZERO(test_task);
+ return retval;
}
static void __exit preemptirq_delay_exit(void)
{
- return;
+ kobject_put(preemptirq_delay_kobj);
}
module_init(preemptirq_delay_init)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 66358d66c933..61f0e92ace99 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -11,6 +11,7 @@
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
+#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
@@ -299,8 +300,6 @@ u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
-#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
-
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -442,7 +441,7 @@ enum {
struct ring_buffer_per_cpu {
int cpu;
atomic_t record_disabled;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
@@ -481,7 +480,7 @@ struct ring_buffer_per_cpu {
struct rb_irq_work irq_work;
};
-struct ring_buffer {
+struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
@@ -517,7 +516,7 @@ struct ring_buffer_iter {
*
* Returns the number of pages used by a per_cpu buffer of the ring buffer.
*/
-size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
+size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
{
return buffer->buffers[cpu]->nr_pages;
}
@@ -529,7 +528,7 @@ size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
*
* Returns the number of pages that have content in the ring buffer.
*/
-size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu)
+size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
size_t cnt;
@@ -572,7 +571,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
@@ -683,7 +682,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
* Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
-__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -741,13 +740,13 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
-static inline u64 rb_time_stamp(struct ring_buffer *buffer)
+static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
/* shift to debug/test normalization and TIME_EXTENTS */
return buffer->clock() << DEBUG_SHIFT;
}
-u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
{
u64 time;
@@ -759,7 +758,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
-void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
int cpu, u64 *ts)
{
/* Just stupid testing the normalize function and deltas */
@@ -1282,7 +1281,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1367,16 +1366,17 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
* __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
* @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
* flag. This flag means that the buffer will overwrite old data
* when the buffer wraps. If this flag is not set, the buffer will
* drop data when the tail hits the head.
*/
-struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
long nr_pages;
int bsize;
int cpu;
@@ -1446,7 +1446,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* @buffer: the buffer to free.
*/
void
-ring_buffer_free(struct ring_buffer *buffer)
+ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
@@ -1462,18 +1462,18 @@ ring_buffer_free(struct ring_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_free);
-void ring_buffer_set_clock(struct ring_buffer *buffer,
+void ring_buffer_set_clock(struct trace_buffer *buffer,
u64 (*clock)(void))
{
buffer->clock = clock;
}
-void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs)
{
buffer->time_stamp_abs = abs;
}
-bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
{
return buffer->time_stamp_abs;
}
@@ -1711,7 +1711,7 @@ static void update_pages_handler(struct work_struct *work)
*
* Returns 0 on success and < 0 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -1890,7 +1890,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
-void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val)
{
mutex_lock(&buffer->mutex);
if (val)
@@ -2205,7 +2205,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
{
struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
- struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct trace_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
@@ -2329,11 +2329,11 @@ static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
/**
* rb_update_event - update event type and data
+ * @cpu_buffer: The per cpu buffer of the @event
* @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
+ * @info: The info to update the @event with (contains length and delta)
*
- * Update the type and data fields of the event. The length
+ * Update the type and data fields of the @event. The length
* is the actual size that is written to the ring buffer,
* and with this, we can determine what to place into the
* data field.
@@ -2608,7 +2608,7 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
}
static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
size_t nr_pages;
size_t dirty;
@@ -2732,7 +2732,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
* Call this function before calling another ring_buffer_lock_reserve() and
* call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
*/
-void ring_buffer_nest_start(struct ring_buffer *buffer)
+void ring_buffer_nest_start(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
@@ -2752,7 +2752,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer)
* Must be called after ring_buffer_nest_start() and after the
* ring_buffer_unlock_commit().
*/
-void ring_buffer_nest_end(struct ring_buffer *buffer)
+void ring_buffer_nest_end(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
@@ -2774,7 +2774,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer)
*
* Must be paired with ring_buffer_lock_reserve.
*/
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+int ring_buffer_unlock_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -2867,7 +2867,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
}
static __always_inline struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
+rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
{
@@ -2960,7 +2960,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* If NULL is returned, then nothing has been allocated or locked.
*/
struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
+ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
@@ -3061,7 +3061,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* If this function is called, do not call ring_buffer_unlock_commit on
* the event.
*/
-void ring_buffer_discard_commit(struct ring_buffer *buffer,
+void ring_buffer_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3112,7 +3112,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
* Note, like ring_buffer_lock_reserve, the length is the length of the data
* and not the length of the event which would hold the header.
*/
-int ring_buffer_write(struct ring_buffer *buffer,
+int ring_buffer_write(struct trace_buffer *buffer,
unsigned long length,
void *data)
{
@@ -3192,7 +3192,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
*
* The caller should call synchronize_rcu() after this.
*/
-void ring_buffer_record_disable(struct ring_buffer *buffer)
+void ring_buffer_record_disable(struct trace_buffer *buffer)
{
atomic_inc(&buffer->record_disabled);
}
@@ -3205,7 +3205,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
*/
-void ring_buffer_record_enable(struct ring_buffer *buffer)
+void ring_buffer_record_enable(struct trace_buffer *buffer)
{
atomic_dec(&buffer->record_disabled);
}
@@ -3222,7 +3222,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
* it works like an on/off switch, where as the disable() version
* must be paired with a enable().
*/
-void ring_buffer_record_off(struct ring_buffer *buffer)
+void ring_buffer_record_off(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
@@ -3245,7 +3245,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off);
* it works like an on/off switch, where as the enable() version
* must be paired with a disable().
*/
-void ring_buffer_record_on(struct ring_buffer *buffer)
+void ring_buffer_record_on(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
@@ -3263,7 +3263,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on);
*
* Returns true if the ring buffer is in a state that it accepts writes.
*/
-bool ring_buffer_record_is_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_on(struct trace_buffer *buffer)
{
return !atomic_read(&buffer->record_disabled);
}
@@ -3279,7 +3279,7 @@ bool ring_buffer_record_is_on(struct ring_buffer *buffer)
* ring_buffer_record_disable(), as that is a temporary disabling of
* the ring buffer.
*/
-bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
{
return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
}
@@ -3294,7 +3294,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
*
* The caller should call synchronize_rcu() after this.
*/
-void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3314,7 +3314,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
*/
-void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3344,7 +3344,7 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
-u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
{
unsigned long flags;
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3377,7 +3377,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
-unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3397,7 +3397,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the entries from.
*/
-unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3416,7 +3416,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
-unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3439,7 +3439,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
-ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3461,7 +3461,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
-ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
@@ -3482,7 +3482,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
* @cpu: The per CPU buffer to get the number of events read
*/
unsigned long
-ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -3501,7 +3501,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
* Returns the total number of entries in the ring buffer
* (all CPU entries)
*/
-unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+unsigned long ring_buffer_entries(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long entries = 0;
@@ -3524,7 +3524,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries);
* Returns the total number of overruns in the ring buffer
* (all CPU entries)
*/
-unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long overruns = 0;
@@ -3948,7 +3948,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_peek);
static struct ring_buffer_event *
rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
int nr_loops = 0;
@@ -4076,7 +4076,7 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
* not consume the data.
*/
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
@@ -4140,7 +4140,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -4200,7 +4200,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -4330,8 +4330,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
/**
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
+ * @cpu: The CPU to get ring buffer size from.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
+unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
/*
* Earlier, this method returned
@@ -4397,7 +4398,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
-void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
unsigned long flags;
@@ -4434,7 +4435,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
* ring_buffer_reset - reset a ring buffer
* @buffer: The ring buffer to reset all cpu buffers
*/
-void ring_buffer_reset(struct ring_buffer *buffer)
+void ring_buffer_reset(struct trace_buffer *buffer)
{
int cpu;
@@ -4447,7 +4448,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
* rind_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
-bool ring_buffer_empty(struct ring_buffer *buffer)
+bool ring_buffer_empty(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4477,7 +4478,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
* @buffer: The ring buffer
* @cpu: The CPU buffer to test
*/
-bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4503,14 +4504,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
* @buffer_a: One buffer to swap with
* @buffer_b: The other buffer to swap with
+ * @cpu: the CPU of the buffers to swap
*
* This function is useful for tracers that want to take a "snapshot"
* of a CPU buffer and has another back up buffer lying around.
* it is expected that the tracer handles the cpu buffer not being
* used at the moment.
*/
-int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
- struct ring_buffer *buffer_b, int cpu)
+int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
+ struct trace_buffer *buffer_b, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer_a;
struct ring_buffer_per_cpu *cpu_buffer_b;
@@ -4589,7 +4591,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
+void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
@@ -4636,7 +4638,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct buffer_data_page *bpage = data;
@@ -4696,7 +4698,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* >=0 if data has been transferred, returns the offset of consumed data.
* <0 if no data has been transferred.
*/
-int ring_buffer_read_page(struct ring_buffer *buffer,
+int ring_buffer_read_page(struct trace_buffer *buffer,
void **data_page, size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
@@ -4867,12 +4869,12 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_page);
*/
int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;
- buffer = container_of(node, struct ring_buffer, node);
+ buffer = container_of(node, struct trace_buffer, node);
if (cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
@@ -4922,7 +4924,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
static struct task_struct *rb_threads[NR_CPUS] __initdata;
struct rb_test_data {
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long events;
unsigned long bytes_written;
unsigned long bytes_alloc;
@@ -5064,10 +5066,15 @@ static __init int rb_hammer_test(void *arg)
static __init int test_ringbuffer(void)
{
struct task_struct *rb_hammer;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
int cpu;
int ret = 0;
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Lockdown is enabled, skipping ring buffer tests\n");
+ return 0;
+ }
+
pr_info("Running ring buffer tests...\n");
buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0564f6db0561..8df0aa810950 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -29,7 +29,7 @@ static int reader_finish;
static DECLARE_COMPLETION(read_start);
static DECLARE_COMPLETION(read_done);
-static struct ring_buffer *buffer;
+static struct trace_buffer *buffer;
static struct task_struct *producer;
static struct task_struct *consumer;
static unsigned long read;
@@ -267,12 +267,12 @@ static void ring_buffer_producer(void)
if (consumer && !(cnt % wakeup_interval))
wake_up_process(consumer);
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
/*
- * If we are a non preempt kernel, the 10 second run will
+ * If we are a non preempt kernel, the 10 seconds run will
* stop everything while it runs. Instead, we will call
* cond_resched and also add any time that was lost by a
- * rescedule.
+ * reschedule.
*
* Do a cond resched at the same frequency we would wake up
* the reader.
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
new file mode 100644
index 000000000000..4aefe003cb7c
--- /dev/null
+++ b/kernel/trace/synth_event_gen_test.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module for in-kernel sythetic event creation and generation.
+ *
+ * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/trace_events.h>
+
+/*
+ * This module is a simple test of basic functionality for in-kernel
+ * synthetic event creation and generation, the first and second tests
+ * using synth_event_gen_cmd_start() and synth_event_add_field(), the
+ * third uses synth_event_create() to do it all at once with a static
+ * field array.
+ *
+ * Following that are a few examples using the created events to test
+ * various ways of tracing a synthetic event.
+ *
+ * To test, select CONFIG_SYNTH_EVENT_GEN_TEST and build the module.
+ * Then:
+ *
+ * # insmod kernel/trace/synth_event_gen_test.ko
+ * # cat /sys/kernel/debug/tracing/trace
+ *
+ * You should see several events in the trace buffer -
+ * "create_synth_test", "empty_synth_test", and several instances of
+ * "gen_synth_test".
+ *
+ * To remove the events, remove the module:
+ *
+ * # rmmod synth_event_gen_test
+ *
+ */
+
+static struct trace_event_file *create_synth_test;
+static struct trace_event_file *empty_synth_test;
+static struct trace_event_file *gen_synth_test;
+
+/*
+ * Test to make sure we can create a synthetic event, then add more
+ * fields.
+ */
+static int __init test_gen_synth_cmd(void)
+{
+ struct dynevent_cmd cmd;
+ u64 vals[7];
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Create the empty gen_synth_test synthetic event with the
+ * first 4 fields.
+ */
+ ret = synth_event_gen_cmd_start(&cmd, "gen_synth_test", THIS_MODULE,
+ "pid_t", "next_pid_field",
+ "char[16]", "next_comm_field",
+ "u64", "ts_ns",
+ "u64", "ts_ms");
+ if (ret)
+ goto free;
+
+ /* Use synth_event_add_field to add the rest of the fields */
+
+ ret = synth_event_add_field(&cmd, "unsigned int", "cpu");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[64]", "my_string_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "int", "my_int_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the gen_synth_test event file. We need to prevent
+ * the instance and event from disappearing from underneath
+ * us, which trace_get_event_file() does (though in this case
+ * we're using the top-level instance which never goes away).
+ */
+ gen_synth_test = trace_get_event_file(NULL, "synthetic",
+ "gen_synth_test");
+ if (IS_ERR(gen_synth_test)) {
+ ret = PTR_ERR(gen_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic", "gen_synth_test", true);
+ if (ret) {
+ trace_put_event_file(gen_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"hula hoops"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed"; /* my_string_field */
+ vals[6] = 598; /* my_int_field */
+
+ /* Now generate a gen_synth_test event */
+ ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ synth_event_delete("gen_synth_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+/*
+ * Test to make sure we can create an initially empty synthetic event,
+ * then add all the fields.
+ */
+static int __init test_empty_synth_event(void)
+{
+ struct dynevent_cmd cmd;
+ u64 vals[7];
+ char *buf;
+ int ret;
+
+ /* Create a buffer to hold the generated command */
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Before generating the command, initialize the cmd object */
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ /*
+ * Create the empty_synth_test synthetic event with no fields.
+ */
+ ret = synth_event_gen_cmd_start(&cmd, "empty_synth_test", THIS_MODULE);
+ if (ret)
+ goto free;
+
+ /* Use synth_event_add_field to add all of the fields */
+
+ ret = synth_event_add_field(&cmd, "pid_t", "next_pid_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[16]", "next_comm_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "u64", "ts_ns");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "u64", "ts_ms");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "unsigned int", "cpu");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "char[64]", "my_string_field");
+ if (ret)
+ goto free;
+
+ ret = synth_event_add_field(&cmd, "int", "my_int_field");
+ if (ret)
+ goto free;
+
+ /* All fields have been added, close and register the synth event */
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret)
+ goto free;
+
+ /*
+ * Now get the empty_synth_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ empty_synth_test = trace_get_event_file(NULL, "synthetic",
+ "empty_synth_test");
+ if (IS_ERR(empty_synth_test)) {
+ ret = PTR_ERR(empty_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic", "empty_synth_test", true);
+ if (ret) {
+ trace_put_event_file(empty_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed_2.0"; /* my_string_field */
+ vals[6] = 399; /* my_int_field */
+
+ /* Now trace an empty_synth_test event */
+ ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ synth_event_delete("empty_synth_test");
+ free:
+ kfree(buf);
+
+ goto out;
+}
+
+static struct synth_field_desc create_synth_test_fields[] = {
+ { .type = "pid_t", .name = "next_pid_field" },
+ { .type = "char[16]", .name = "next_comm_field" },
+ { .type = "u64", .name = "ts_ns" },
+ { .type = "u64", .name = "ts_ms" },
+ { .type = "unsigned int", .name = "cpu" },
+ { .type = "char[64]", .name = "my_string_field" },
+ { .type = "int", .name = "my_int_field" },
+};
+
+/*
+ * Test synthetic event creation all at once from array of field
+ * descriptors.
+ */
+static int __init test_create_synth_event(void)
+{
+ u64 vals[7];
+ int ret;
+
+ /* Create the create_synth_test event with the fields above */
+ ret = synth_event_create("create_synth_test",
+ create_synth_test_fields,
+ ARRAY_SIZE(create_synth_test_fields),
+ THIS_MODULE);
+ if (ret)
+ goto out;
+
+ /*
+ * Now get the create_synth_test event file. We need to
+ * prevent the instance and event from disappearing from
+ * underneath us, which trace_get_event_file() does (though in
+ * this case we're using the top-level instance which never
+ * goes away).
+ */
+ create_synth_test = trace_get_event_file(NULL, "synthetic",
+ "create_synth_test");
+ if (IS_ERR(create_synth_test)) {
+ ret = PTR_ERR(create_synth_test);
+ goto delete;
+ }
+
+ /* Enable the event or you won't see anything */
+ ret = trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic", "create_synth_test", true);
+ if (ret) {
+ trace_put_event_file(create_synth_test);
+ goto delete;
+ }
+
+ /* Create some bogus values just for testing */
+
+ vals[0] = 777; /* next_pid_field */
+ vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
+ vals[2] = 1000000; /* ts_ns */
+ vals[3] = 1000; /* ts_ms */
+ vals[4] = smp_processor_id(); /* cpu */
+ vals[5] = (u64)"thneed"; /* my_string_field */
+ vals[6] = 398; /* my_int_field */
+
+ /* Now generate a create_synth_test event */
+ ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals));
+ out:
+ return ret;
+ delete:
+ /* We got an error after creating the event, delete it */
+ ret = synth_event_delete("create_synth_test");
+
+ goto out;
+}
+
+/*
+ * Test tracing a synthetic event by reserving trace buffer space,
+ * then filling in fields one after another.
+ */
+static int __init test_add_next_synth_val(void)
+{
+ struct synth_event_trace_state trace_state;
+ int ret;
+
+ /* Start by reserving space in the trace buffer */
+ ret = synth_event_trace_start(gen_synth_test, &trace_state);
+ if (ret)
+ return ret;
+
+ /* Write some bogus values into the trace buffer, one after another */
+
+ /* next_pid_field */
+ ret = synth_event_add_next_val(777, &trace_state);
+ if (ret)
+ goto out;
+
+ /* next_comm_field */
+ ret = synth_event_add_next_val((u64)"slinky", &trace_state);
+ if (ret)
+ goto out;
+
+ /* ts_ns */
+ ret = synth_event_add_next_val(1000000, &trace_state);
+ if (ret)
+ goto out;
+
+ /* ts_ms */
+ ret = synth_event_add_next_val(1000, &trace_state);
+ if (ret)
+ goto out;
+
+ /* cpu */
+ ret = synth_event_add_next_val(smp_processor_id(), &trace_state);
+ if (ret)
+ goto out;
+
+ /* my_string_field */
+ ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state);
+ if (ret)
+ goto out;
+
+ /* my_int_field */
+ ret = synth_event_add_next_val(395, &trace_state);
+ out:
+ /* Finally, commit the event */
+ ret = synth_event_trace_end(&trace_state);
+
+ return ret;
+}
+
+/*
+ * Test tracing a synthetic event by reserving trace buffer space,
+ * then filling in fields using field names, which can be done in any
+ * order.
+ */
+static int __init test_add_synth_val(void)
+{
+ struct synth_event_trace_state trace_state;
+ int ret;
+
+ /* Start by reserving space in the trace buffer */
+ ret = synth_event_trace_start(gen_synth_test, &trace_state);
+ if (ret)
+ return ret;
+
+ /* Write some bogus values into the trace buffer, using field names */
+
+ ret = synth_event_add_val("ts_ns", 1000000, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("ts_ms", 1000, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("next_pid_field", 777, &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("next_comm_field", (u64)"silly putty",
+ &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("my_string_field", (u64)"thneed_9",
+ &trace_state);
+ if (ret)
+ goto out;
+
+ ret = synth_event_add_val("my_int_field", 3999, &trace_state);
+ out:
+ /* Finally, commit the event */
+ ret = synth_event_trace_end(&trace_state);
+
+ return ret;
+}
+
+/*
+ * Test tracing a synthetic event all at once from array of values.
+ */
+static int __init test_trace_synth_event(void)
+{
+ int ret;
+
+ /* Trace some bogus values just for testing */
+ ret = synth_event_trace(create_synth_test, 7, /* number of values */
+ 444, /* next_pid_field */
+ (u64)"clackers", /* next_comm_field */
+ 1000000, /* ts_ns */
+ 1000, /* ts_ms */
+ smp_processor_id(), /* cpu */
+ (u64)"Thneed", /* my_string_field */
+ 999); /* my_int_field */
+ return ret;
+}
+
+static int __init synth_event_gen_test_init(void)
+{
+ int ret;
+
+ ret = test_gen_synth_cmd();
+ if (ret)
+ return ret;
+
+ ret = test_empty_synth_event();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+ trace_put_event_file(gen_synth_test);
+ WARN_ON(synth_event_delete("gen_synth_test"));
+ goto out;
+ }
+
+ ret = test_create_synth_event();
+ if (ret) {
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+ trace_put_event_file(gen_synth_test);
+ WARN_ON(synth_event_delete("gen_synth_test"));
+
+ WARN_ON(trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false));
+ trace_put_event_file(empty_synth_test);
+ WARN_ON(synth_event_delete("empty_synth_test"));
+ goto out;
+ }
+
+ ret = test_add_next_synth_val();
+ WARN_ON(ret);
+
+ ret = test_add_synth_val();
+ WARN_ON(ret);
+
+ ret = test_trace_synth_event();
+ WARN_ON(ret);
+ out:
+ return ret;
+}
+
+static void __exit synth_event_gen_test_exit(void)
+{
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("gen_synth_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(empty_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("empty_synth_test"));
+
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(create_synth_test);
+
+ /* Now unregister and free the synthetic event */
+ WARN_ON(synth_event_delete("create_synth_test"));
+}
+
+module_init(synth_event_gen_test_init)
+module_exit(synth_event_gen_test_exit)
+
+MODULE_AUTHOR("Tom Zanussi");
+MODULE_DESCRIPTION("synthetic event generation test");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 525a97fbbc60..c797a15a1fc7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/irqflags.h>
@@ -44,6 +45,9 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/irq_work.h>
+#include <linux/workqueue.h>
#include "trace.h"
#include "trace_output.h"
@@ -158,8 +162,8 @@ union trace_eval_map_item {
static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
-static int tracing_set_tracer(struct trace_array *tr, const char *buf);
-static void ftrace_trace_userstack(struct ring_buffer *buffer,
+int tracing_set_tracer(struct trace_array *tr, const char *buf);
+static void ftrace_trace_userstack(struct trace_buffer *buffer,
unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
@@ -297,15 +301,44 @@ static void __trace_array_put(struct trace_array *this_tr)
this_tr->ref--;
}
+/**
+ * trace_array_put - Decrement the reference counter for this trace array.
+ *
+ * NOTE: Use this when we no longer need the trace array returned by
+ * trace_array_get_by_name(). This ensures the trace array can be later
+ * destroyed.
+ *
+ */
void trace_array_put(struct trace_array *this_tr)
{
+ if (!this_tr)
+ return;
+
mutex_lock(&trace_types_lock);
__trace_array_put(this_tr);
mutex_unlock(&trace_types_lock);
}
+EXPORT_SYMBOL_GPL(trace_array_put);
+
+int tracing_check_open_get_tr(struct trace_array *tr)
+{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (tr && trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ return 0;
+}
int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
@@ -570,7 +603,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
return read;
}
-static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
u64 ts;
@@ -586,7 +619,7 @@ static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
u64 ftrace_now(int cpu)
{
- return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+ return buffer_ftrace_now(&global_trace.array_buffer, cpu);
}
/**
@@ -714,22 +747,22 @@ static inline void trace_access_lock_init(void)
#endif
#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct ring_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs);
#else
-static inline void __ftrace_trace_stack(struct ring_buffer *buffer,
+static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -747,7 +780,7 @@ trace_event_setup(struct ring_buffer_event *event,
}
static __always_inline struct ring_buffer_event *
-__trace_buffer_lock_reserve(struct ring_buffer *buffer,
+__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
@@ -763,8 +796,8 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer,
void tracer_tracing_on(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- ring_buffer_record_on(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ ring_buffer_record_on(tr->array_buffer.buffer);
/*
* This flag is looked at when buffers haven't been allocated
* yet, or by some tracers (like irqsoff), that just want to
@@ -792,7 +825,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
static __always_inline void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
{
__this_cpu_write(trace_taskinfo_save, true);
@@ -815,7 +848,7 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
int __trace_puts(unsigned long ip, const char *str, int size)
{
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
int alloc;
@@ -832,11 +865,14 @@ int __trace_puts(unsigned long ip, const char *str, int size)
alloc = sizeof(*entry) + size + 2; /* possible \n added */
local_save_flags(irq_flags);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
irq_flags, pc);
- if (!event)
- return 0;
+ if (!event) {
+ size = 0;
+ goto out;
+ }
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -852,7 +888,8 @@ int __trace_puts(unsigned long ip, const char *str, int size)
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
-
+ out:
+ ring_buffer_nest_end(buffer);
return size;
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -865,10 +902,11 @@ EXPORT_SYMBOL_GPL(__trace_puts);
int __trace_bputs(unsigned long ip, const char *str)
{
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct bputs_entry *entry;
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ int ret = 0;
int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
@@ -880,11 +918,13 @@ int __trace_bputs(unsigned long ip, const char *str)
return 0;
local_save_flags(irq_flags);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
+
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
irq_flags, pc);
if (!event)
- return 0;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -893,7 +933,10 @@ int __trace_bputs(unsigned long ip, const char *str)
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
- return 1;
+ ret = 1;
+ out:
+ ring_buffer_nest_end(buffer);
+ return ret;
}
EXPORT_SYMBOL_GPL(__trace_bputs);
@@ -1003,9 +1046,9 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
- struct trace_buffer *size_buf, int cpu_id);
-static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
@@ -1015,7 +1058,7 @@ int tracing_alloc_snapshot_instance(struct trace_array *tr)
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+ &tr->array_buffer, RING_BUFFER_ALL_CPUS);
if (ret < 0)
return ret;
@@ -1218,8 +1261,8 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
void tracer_tracing_off(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- ring_buffer_record_off(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ ring_buffer_record_off(tr->array_buffer.buffer);
/*
* This flag is looked at when buffers haven't been allocated
* yet, or by some tracers (like irqsoff), that just want to
@@ -1261,8 +1304,8 @@ void disable_trace_on_warning(void)
*/
bool tracer_tracing_is_on(struct trace_array *tr)
{
- if (tr->trace_buffer.buffer)
- return ring_buffer_record_is_on(tr->trace_buffer.buffer);
+ if (tr->array_buffer.buffer)
+ return ring_buffer_record_is_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -1479,6 +1522,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
}
unsigned long __read_mostly tracing_thresh;
+static const struct file_operations tracing_max_lat_fops;
+
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+static struct workqueue_struct *fsnotify_wq;
+
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array,
+ fsnotify_work);
+ fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
+ tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
+{
+ struct trace_array *tr = container_of(iwork, struct trace_array,
+ fsnotify_irqwork);
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+}
+
+static void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+ init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
+ tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+ d_tracer, &tr->max_latency,
+ &tracing_max_lat_fops);
+}
+
+__init static int latency_fsnotify_init(void)
+{
+ fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!fsnotify_wq) {
+ pr_err("Unable to allocate tr_max_lat_wq\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+void latency_fsnotify(struct trace_array *tr)
+{
+ if (!fsnotify_wq)
+ return;
+ /*
+ * We cannot call queue_work(&tr->fsnotify_work) from here because it's
+ * possible that we are called from __schedule() or do_idle(), which
+ * could cause a deadlock.
+ */
+ irq_work_queue(&tr->fsnotify_irqwork);
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+#define trace_create_maxlat_file(tr, d_tracer) \
+ trace_create_file("tracing_max_latency", 0644, d_tracer, \
+ &tr->max_latency, &tracing_max_lat_fops)
+
+#endif
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -1489,8 +1600,8 @@ unsigned long __read_mostly tracing_thresh;
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_buffer *trace_buf = &tr->trace_buffer;
- struct trace_buffer *max_buf = &tr->max_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
+ struct array_buffer *max_buf = &tr->max_buffer;
struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
@@ -1518,6 +1629,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
/* record this tasks comm */
tracing_record_cmdline(tsk);
+ latency_fsnotify(tr);
}
/**
@@ -1547,8 +1659,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
arch_spin_lock(&tr->max_lock);
- /* Inherit the recordable setting from trace_buffer */
- if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer))
+ /* Inherit the recordable setting from array_buffer */
+ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
ring_buffer_record_on(tr->max_buffer.buffer);
else
ring_buffer_record_off(tr->max_buffer.buffer);
@@ -1557,7 +1669,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
goto out_unlock;
#endif
- swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
+ swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
@@ -1567,9 +1679,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
/**
* update_max_tr_single - only copy one trace over, and reset the rest
- * @tr - tracer
- * @tsk - task with the latency
- * @cpu - the cpu of the buffer to copy.
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
*
* Flip the trace of a single CPU buffer between the @tr and the max_tr.
*/
@@ -1590,7 +1702,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
- ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
+ ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
@@ -1616,7 +1728,7 @@ static int wait_on_pipe(struct trace_iterator *iter, int full)
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
full);
}
@@ -1667,7 +1779,7 @@ static int run_tracer_selftest(struct tracer *type)
* internal tracing to verify that everything is in order.
* If we fail, we do not register this tracer.
*/
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
tr->current_trace = type;
@@ -1693,7 +1805,7 @@ static int run_tracer_selftest(struct tracer *type)
return -1;
}
/* Only reset on passing, to avoid touching corrupted buffers */
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
@@ -1767,7 +1879,7 @@ static void __init apply_trace_boot_options(void);
/**
* register_tracer - register a tracer with the ftrace system.
- * @type - the plugin for the tracer
+ * @type: the plugin for the tracer
*
* Register a new plugin tracer.
*/
@@ -1786,6 +1898,12 @@ int __init register_tracer(struct tracer *type)
return -1;
}
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Can not register tracer %s due to lockdown\n",
+ type->name);
+ return -EPERM;
+ }
+
mutex_lock(&trace_types_lock);
tracing_selftest_running = true;
@@ -1854,9 +1972,9 @@ int __init register_tracer(struct tracer *type)
return ret;
}
-void tracing_reset(struct trace_buffer *buf, int cpu)
+static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
- struct ring_buffer *buffer = buf->buffer;
+ struct trace_buffer *buffer = buf->buffer;
if (!buffer)
return;
@@ -1870,9 +1988,9 @@ void tracing_reset(struct trace_buffer *buf, int cpu)
ring_buffer_record_enable(buffer);
}
-void tracing_reset_online_cpus(struct trace_buffer *buf)
+void tracing_reset_online_cpus(struct array_buffer *buf)
{
- struct ring_buffer *buffer = buf->buffer;
+ struct trace_buffer *buffer = buf->buffer;
int cpu;
if (!buffer)
@@ -1900,7 +2018,7 @@ void tracing_reset_all_online_cpus(void)
if (!tr->clear_trace)
continue;
tr->clear_trace = false;
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
#endif
@@ -1990,7 +2108,7 @@ int is_tracing_stopped(void)
*/
void tracing_start(void)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
if (tracing_disabled)
@@ -2009,7 +2127,7 @@ void tracing_start(void)
/* Prevent the buffers from switching */
arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
@@ -2027,7 +2145,7 @@ void tracing_start(void)
static void tracing_start_tr(struct trace_array *tr)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
if (tracing_disabled)
@@ -2048,7 +2166,7 @@ static void tracing_start_tr(struct trace_array *tr)
goto out;
}
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
@@ -2064,7 +2182,7 @@ static void tracing_start_tr(struct trace_array *tr)
*/
void tracing_stop(void)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
raw_spin_lock_irqsave(&global_trace.start_lock, flags);
@@ -2074,7 +2192,7 @@ void tracing_stop(void)
/* Prevent the buffers from switching */
arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.trace_buffer.buffer;
+ buffer = global_trace.array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -2092,7 +2210,7 @@ void tracing_stop(void)
static void tracing_stop_tr(struct trace_array *tr)
{
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long flags;
/* If global, we need to also stop the max tracer */
@@ -2103,7 +2221,7 @@ static void tracing_stop_tr(struct trace_array *tr)
if (tr->stop_count++)
goto out;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -2230,9 +2348,9 @@ static bool tracing_record_taskinfo_skip(int flags)
/**
* tracing_record_taskinfo - record the task info of a task
*
- * @task - task to record
- * @flags - TRACE_RECORD_CMDLINE for recording comm
- * - TRACE_RECORD_TGID for recording tgid
+ * @task: task to record
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
*/
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
@@ -2258,10 +2376,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags)
/**
* tracing_record_taskinfo_sched_switch - record task info for sched_switch
*
- * @prev - previous task during sched_switch
- * @next - next task during sched_switch
- * @flags - TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
+ * @prev: previous task during sched_switch
+ * @next: next task during sched_switch
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
*/
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
struct task_struct *next, int flags)
@@ -2334,7 +2452,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *
-trace_buffer_lock_reserve(struct ring_buffer *buffer,
+trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
@@ -2453,10 +2571,10 @@ void trace_buffered_event_disable(void)
preempt_enable();
}
-static struct ring_buffer *temp_buffer;
+static struct trace_buffer *temp_buffer;
struct ring_buffer_event *
-trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
unsigned long flags, int pc)
@@ -2464,7 +2582,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
struct ring_buffer_event *entry;
int val;
- *current_rb = trace_file->tr->trace_buffer.buffer;
+ *current_rb = trace_file->tr->array_buffer.buffer;
if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
(EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
@@ -2502,6 +2620,7 @@ static DEFINE_MUTEX(tracepoint_printk_mutex);
static void output_printk(struct trace_event_buffer *fbuffer)
{
struct trace_event_call *event_call;
+ struct trace_event_file *file;
struct trace_event *event;
unsigned long flags;
struct trace_iterator *iter = tracepoint_print_iter;
@@ -2515,6 +2634,12 @@ static void output_printk(struct trace_event_buffer *fbuffer)
!event_call->event.funcs->trace)
return;
+ file = fbuffer->trace_file;
+ if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
+ (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+ !filter_match_preds(file->filter, fbuffer->entry)))
+ return;
+
event = &fbuffer->trace_file->event_call->event;
spin_lock_irqsave(&tracepoint_iter_lock, flags);
@@ -2565,9 +2690,9 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
- event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
+ event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->flags, fbuffer->pc, fbuffer->regs);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2581,7 +2706,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
# define STACK_SKIP 3
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs)
@@ -2602,7 +2727,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* Similar to trace_buffer_unlock_commit_regs() but do not dump stack.
*/
void
-trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
__buffer_unlock_commit(buffer, event);
@@ -2642,10 +2767,10 @@ static void ftrace_exports(struct ring_buffer_event *event)
preempt_disable_notrace();
- export = rcu_dereference_raw_notrace(ftrace_exports_list);
+ export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event);
- export = rcu_dereference_raw_notrace(export->next);
+ export = rcu_dereference_raw_check(export->next);
}
preempt_enable_notrace();
@@ -2737,7 +2862,7 @@ trace_function(struct trace_array *tr,
int pc)
{
struct trace_event_call *call = &event_function;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -2775,7 +2900,7 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct ring_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -2850,7 +2975,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
}
static inline void ftrace_trace_stack(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
@@ -2863,7 +2988,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
@@ -2901,7 +3026,7 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.trace_buffer.buffer,
+ __ftrace_trace_stack(global_trace.array_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -2910,7 +3035,7 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
static void
-ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
@@ -2955,7 +3080,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
-static void ftrace_trace_userstack(struct ring_buffer *buffer,
+static void ftrace_trace_userstack(struct trace_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -3001,7 +3126,7 @@ static int alloc_percpu_trace_buffer(void)
struct trace_buffer_struct *buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
return -ENOMEM;
trace_percpu_buffer = buffers;
@@ -3046,7 +3171,7 @@ void trace_printk_init_buffers(void)
* directly here. If the global_trace.buffer is already
* allocated here, then this was called by module code.
*/
- if (global_trace.trace_buffer.buffer)
+ if (global_trace.array_buffer.buffer)
tracing_start_cmdline_record();
}
EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
@@ -3072,13 +3197,15 @@ static void trace_printk_start_stop_comm(int enabled)
/**
* trace_vbprintk - write binary msg to tracing buffer
- *
+ * @ip: The address of the caller
+ * @fmt: The string format to write to the buffer
+ * @args: Arguments for @fmt
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
unsigned long flags;
@@ -3103,11 +3230,12 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
- goto out;
+ goto out_put;
local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
flags, pc);
if (!event)
@@ -3123,6 +3251,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ ring_buffer_nest_end(buffer);
+out_put:
put_trace_buf();
out_nobuffer:
@@ -3135,7 +3265,7 @@ EXPORT_SYMBOL_GPL(trace_vbprintk);
__printf(3, 0)
static int
-__trace_array_vprintk(struct ring_buffer *buffer,
+__trace_array_vprintk(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
struct trace_event_call *call = &event_print;
@@ -3165,6 +3295,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
+ ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
flags, pc);
if (!event)
@@ -3179,6 +3310,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
}
out:
+ ring_buffer_nest_end(buffer);
put_trace_buf();
out_nobuffer:
@@ -3192,7 +3324,7 @@ __printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
- return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+ return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
__printf(3, 0)
@@ -3205,6 +3337,9 @@ int trace_array_printk(struct trace_array *tr,
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
+ if (!tr)
+ return -ENOENT;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3213,7 +3348,7 @@ int trace_array_printk(struct trace_array *tr,
EXPORT_SYMBOL_GPL(trace_array_printk);
__printf(3, 4)
-int trace_array_printk_buf(struct ring_buffer *buffer,
+int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
int ret;
@@ -3254,7 +3389,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
+ event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
lost_events);
if (event) {
@@ -3269,7 +3404,7 @@ static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
unsigned long *missing_events, u64 *ent_ts)
{
- struct ring_buffer *buffer = iter->trace_buffer->buffer;
+ struct trace_buffer *buffer = iter->array_buffer->buffer;
struct trace_entry *ent, *next = NULL;
unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
@@ -3346,7 +3481,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
static void trace_consume(struct trace_iterator *iter)
{
- ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
+ ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts,
&iter->lost_events);
}
@@ -3384,7 +3519,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
unsigned long entries = 0;
u64 ts;
- per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
+ per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0;
buf_iter = trace_buffer_iter(iter, cpu);
if (!buf_iter)
@@ -3398,13 +3533,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* by the timestamp being before the start of the buffer.
*/
while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
- if (ts >= iter->trace_buffer->time_start)
+ if (ts >= iter->array_buffer->time_start)
break;
entries++;
ring_buffer_read(buf_iter, NULL);
}
- per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
+ per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
}
/*
@@ -3489,7 +3624,7 @@ static void s_stop(struct seq_file *m, void *p)
}
static void
-get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
+get_total_entries_cpu(struct array_buffer *buf, unsigned long *total,
unsigned long *entries, int cpu)
{
unsigned long count;
@@ -3511,7 +3646,7 @@ get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
}
static void
-get_total_entries(struct trace_buffer *buf,
+get_total_entries(struct array_buffer *buf,
unsigned long *total, unsigned long *entries)
{
unsigned long t, e;
@@ -3534,7 +3669,7 @@ unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
if (!tr)
tr = &global_trace;
- get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
+ get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu);
return entries;
}
@@ -3546,7 +3681,7 @@ unsigned long trace_total_entries(struct trace_array *tr)
if (!tr)
tr = &global_trace;
- get_total_entries(&tr->trace_buffer, &total, &entries);
+ get_total_entries(&tr->array_buffer, &total, &entries);
return entries;
}
@@ -3563,7 +3698,7 @@ static void print_lat_help_header(struct seq_file *m)
"# \\ / ||||| \\ | / \n");
}
-static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
+static void print_event_info(struct array_buffer *buf, struct seq_file *m)
{
unsigned long total;
unsigned long entries;
@@ -3574,7 +3709,7 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
@@ -3585,7 +3720,7 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
@@ -3607,7 +3742,7 @@ void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_buffer *buf = iter->trace_buffer;
+ struct array_buffer *buf = iter->array_buffer;
struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
struct tracer *type = iter->trace;
unsigned long entries;
@@ -3634,6 +3769,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
"desktop",
#elif defined(CONFIG_PREEMPT)
"preempt",
+#elif defined(CONFIG_PREEMPT_RT)
+ "preempt_rt",
#else
"unknown",
#endif
@@ -3680,7 +3817,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
cpumask_test_cpu(iter->cpu, iter->started))
return;
- if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
+ if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries)
return;
if (cpumask_available(iter->started))
@@ -3814,7 +3951,7 @@ int trace_empty(struct trace_iterator *iter)
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
return 0;
}
return 1;
@@ -3826,7 +3963,7 @@ int trace_empty(struct trace_iterator *iter)
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
return 0;
}
}
@@ -3916,10 +4053,10 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer,
+ print_func_help_header_irq(iter->array_buffer,
m, trace_flags);
else
- print_func_help_header(iter->trace_buffer, m,
+ print_func_help_header(iter->array_buffer, m,
trace_flags);
}
}
@@ -4077,21 +4214,21 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
#ifdef CONFIG_TRACER_MAX_TRACE
/* Currently only the top directory has a snapshot */
if (tr->current_trace->print_max || snapshot)
- iter->trace_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->max_buffer;
else
#endif
- iter->trace_buffer = &tr->trace_buffer;
+ iter->array_buffer = &tr->array_buffer;
iter->snapshot = snapshot;
iter->pos = -1;
iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
/* Notify the tracer early; before we stop tracing. */
- if (iter->trace && iter->trace->open)
+ if (iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ if (ring_buffer_overruns(iter->array_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -4105,7 +4242,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ ring_buffer_read_prepare(iter->array_buffer->buffer,
cpu, GFP_KERNEL);
}
ring_buffer_read_prepare_sync();
@@ -4116,7 +4253,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ ring_buffer_read_prepare(iter->array_buffer->buffer,
cpu, GFP_KERNEL);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
@@ -4138,8 +4275,11 @@ release:
int tracing_open_generic(struct inode *inode, struct file *filp)
{
- if (tracing_disabled)
- return -ENODEV;
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
filp->private_data = inode->i_private;
return 0;
@@ -4154,15 +4294,14 @@ bool tracing_is_disabled(void)
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
*/
-static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
+ int ret;
- if (tracing_disabled)
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
filp->private_data = inode->i_private;
@@ -4231,15 +4370,16 @@ static int tracing_open(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
- int ret = 0;
+ int ret;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
- struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->current_trace->print_max)
@@ -4249,7 +4389,7 @@ static int tracing_open(struct inode *inode, struct file *file)
if (cpu == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(trace_buf, cpu);
+ tracing_reset_cpu(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -4350,12 +4490,15 @@ static int show_traces_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- if (tracing_disabled)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
ret = seq_open(file, &show_traces_seq_ops);
- if (ret)
+ if (ret) {
+ trace_array_put(tr);
return ret;
+ }
m = file->private_data;
m->private = tr;
@@ -4363,6 +4506,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
+static int show_traces_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+ return seq_release(inode, file);
+}
+
static ssize_t
tracing_write_stub(struct file *filp, const char __user *ubuf,
size_t count, loff_t *ppos)
@@ -4393,8 +4544,8 @@ static const struct file_operations tracing_fops = {
static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
- .release = seq_release,
.llseek = seq_lseek,
+ .release = show_traces_release,
};
static ssize_t
@@ -4425,20 +4576,13 @@ out_err:
return count;
}
-static ssize_t
-tracing_cpumask_write(struct file *filp, const char __user *ubuf,
- size_t count, loff_t *ppos)
+int tracing_set_cpumask(struct trace_array *tr,
+ cpumask_var_t tracing_cpumask_new)
{
- struct trace_array *tr = file_inode(filp)->i_private;
- cpumask_var_t tracing_cpumask_new;
- int err, cpu;
-
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
- return -ENOMEM;
+ int cpu;
- err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
- if (err)
- goto err_unlock;
+ if (!tr)
+ return -EINVAL;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
@@ -4449,24 +4593,47 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
*/
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
- ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
+ atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
+ ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
- ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
+ atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
+ ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
}
}
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
+
+ return 0;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_array *tr = file_inode(filp)->i_private;
+ cpumask_var_t tracing_cpumask_new;
+ int err;
+
+ if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+ if (err)
+ goto err_free;
+
+ err = tracing_set_cpumask(tr, tracing_cpumask_new);
+ if (err)
+ goto err_free;
+
free_cpumask_var(tracing_cpumask_new);
return count;
-err_unlock:
+err_free:
free_cpumask_var(tracing_cpumask_new);
return err;
@@ -4556,6 +4723,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ if ((mask == TRACE_ITER_RECORD_TGID) ||
+ (mask == TRACE_ITER_RECORD_CMD))
+ lockdep_assert_held(&event_mutex);
+
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
@@ -4575,7 +4746,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map)
- tgid_map = kcalloc(PID_MAX_DEFAULT + 1,
+ tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
sizeof(*tgid_map),
GFP_KERNEL);
if (!tgid_map) {
@@ -4593,7 +4764,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
ftrace_pid_follow_fork(tr, enabled);
if (mask == TRACE_ITER_OVERWRITE) {
- ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
+ ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
#endif
@@ -4607,7 +4778,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return 0;
}
-static int trace_set_options(struct trace_array *tr, char *option)
+int trace_set_options(struct trace_array *tr, char *option)
{
char *cmp;
int neg = 0;
@@ -4623,6 +4794,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
cmp += len;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = match_string(trace_options, -1, cmp);
@@ -4633,6 +4805,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
/*
* If the first trailing whitespace is replaced with '\0' by strstrip,
@@ -4695,11 +4868,9 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_disabled)
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
ret = single_open(file, tracing_trace_options_show, inode->i_private);
if (ret < 0)
@@ -4813,15 +4984,15 @@ static const char readme_msg[] =
#endif
#endif /* CONFIG_STACK_TRACER */
#ifdef CONFIG_DYNAMIC_EVENTS
- " dynamic_events\t\t- Add/remove/show the generic dynamic events\n"
+ " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_KPROBE_EVENTS
- " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
+ " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
- " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
+ " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
@@ -4846,7 +5017,7 @@ static const char readme_msg[] =
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
- "\t +|-[u]<offset>(<fetcharg>)\n"
+ "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t <type>\\[<array-size>\\]\n"
@@ -5036,8 +5207,11 @@ static const struct seq_operations tracing_saved_tgids_seq_ops = {
static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
{
- if (tracing_disabled)
- return -ENODEV;
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
return seq_open(filp, &tracing_saved_tgids_seq_ops);
}
@@ -5113,8 +5287,11 @@ static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
- if (tracing_disabled)
- return -ENODEV;
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}
@@ -5222,14 +5399,12 @@ static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
* Paranoid! If ptr points to end, we don't want to increment past it.
* This really should never happen.
*/
+ (*pos)++;
ptr = update_eval_map(ptr);
if (WARN_ON_ONCE(!ptr))
return NULL;
ptr++;
-
- (*pos)++;
-
ptr = update_eval_map(ptr);
return ptr;
@@ -5278,8 +5453,11 @@ static const struct seq_operations tracing_eval_map_seq_ops = {
static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
- if (tracing_disabled)
- return -ENODEV;
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
return seq_open(filp, &tracing_eval_map_seq_ops);
}
@@ -5392,11 +5570,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
return t->init(tr);
}
-static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
+static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
int cpu;
@@ -5406,8 +5584,8 @@ static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
- struct trace_buffer *size_buf, int cpu_id)
+static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id)
{
int cpu, ret = 0;
@@ -5445,10 +5623,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ring_buffer_expanded = true;
/* May be called before buffers are initialized */
- if (!tr->trace_buffer.buffer)
+ if (!tr->array_buffer.buffer)
return 0;
- ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
if (ret < 0)
return ret;
@@ -5459,8 +5637,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
if (ret < 0) {
- int r = resize_buffer_duplicate_size(&tr->trace_buffer,
- &tr->trace_buffer, cpu);
+ int r = resize_buffer_duplicate_size(&tr->array_buffer,
+ &tr->array_buffer, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -5491,15 +5669,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->trace_buffer, size);
+ set_buffer_entries(&tr->array_buffer, size);
else
- per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
return ret;
}
-static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
- unsigned long size, int cpu_id)
+ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id)
{
int ret = size;
@@ -5578,7 +5756,7 @@ static void add_tracer_options(struct trace_array *tr, struct tracer *t)
create_trace_option_files(tr, t);
}
-static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5802,13 +5980,11 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
- int ret = 0;
-
- if (tracing_disabled)
- return -ENODEV;
+ int ret;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
mutex_lock(&trace_types_lock);
@@ -5839,7 +6015,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
iter->tr = tr;
- iter->trace_buffer = &tr->trace_buffer;
+ iter->array_buffer = &tr->array_buffer;
iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -5899,7 +6075,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
*/
return EPOLLIN | EPOLLRDNORM;
else
- return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
filp, poll_table);
}
@@ -5997,6 +6173,7 @@ waitagain:
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
cpumask_clear(iter->started);
+ trace_seq_init(&iter->seq);
iter->pos = -1;
trace_event_read_lock();
@@ -6215,8 +6392,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
for_each_tracing_cpu(cpu) {
/* fill in the size from first enabled cpu */
if (size == 0)
- size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
- if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
+ size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries;
+ if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) {
buf_size_same = 0;
break;
}
@@ -6232,7 +6409,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
} else
r = sprintf(buf, "X\n");
} else
- r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10);
mutex_unlock(&trace_types_lock);
@@ -6279,7 +6456,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
+ size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -6329,7 +6506,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
enum event_trigger_type tt = ETT_NONE;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
ssize_t written;
@@ -6358,7 +6535,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt < FAULTED_SIZE)
size += FAULTED_SIZE - cnt;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
if (unlikely(!event))
@@ -6409,7 +6586,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct raw_data_entry *entry;
unsigned long irq_flags;
ssize_t written;
@@ -6438,7 +6615,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
irq_flags, preempt_count());
if (!event)
@@ -6493,13 +6670,13 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tr->clock_id = i;
- ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
+ ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func);
/*
* New clock may not be consistent with the previous clock.
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->max_buffer.buffer)
@@ -6545,11 +6722,9 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_disabled)
- return -ENODEV;
-
- if (trace_array_get(tr))
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
ret = single_open(file, tracing_clock_show, inode->i_private);
if (ret < 0)
@@ -6564,7 +6739,7 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
mutex_lock(&trace_types_lock);
- if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+ if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer))
seq_puts(m, "delta [absolute]\n");
else
seq_puts(m, "[delta] absolute\n");
@@ -6579,11 +6754,9 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_disabled)
- return -ENODEV;
-
- if (trace_array_get(tr))
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
if (ret < 0)
@@ -6611,7 +6784,7 @@ int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
goto out;
}
- ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+ ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->max_buffer.buffer)
@@ -6636,10 +6809,11 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
struct seq_file *m;
- int ret = 0;
+ int ret;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
if (file->f_mode & FMODE_READ) {
iter = __tracing_open(inode, file, true);
@@ -6659,7 +6833,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
ret = 0;
iter->tr = tr;
- iter->trace_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->max_buffer;
iter->cpu_file = tracing_get_cpu(inode);
m->private = iter;
file->private_data = m;
@@ -6722,7 +6896,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
#endif
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
- &tr->trace_buffer, iter->cpu_file);
+ &tr->array_buffer, iter->cpu_file);
else
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
@@ -6740,7 +6914,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(&tr->max_buffer);
else
- tracing_reset(&tr->max_buffer, iter->cpu_file);
+ tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
}
break;
}
@@ -6784,6 +6958,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
struct ftrace_buffer_info *info;
int ret;
+ /* The following checks for tracefs lockdown */
ret = tracing_buffers_open(inode, filp);
if (ret < 0)
return ret;
@@ -6796,7 +6971,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
}
info->iter.snapshot = true;
- info->iter.trace_buffer = &info->iter.tr->max_buffer;
+ info->iter.array_buffer = &info->iter.tr->max_buffer;
return ret;
}
@@ -7103,8 +7278,9 @@ static int tracing_err_log_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret = 0;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
/* If this file was opened for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
@@ -7155,11 +7331,9 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
struct ftrace_buffer_info *info;
int ret;
- if (tracing_disabled)
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
@@ -7172,7 +7346,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
info->iter.tr = tr;
info->iter.cpu_file = tracing_get_cpu(inode);
info->iter.trace = tr->current_trace;
- info->iter.trace_buffer = &tr->trace_buffer;
+ info->iter.array_buffer = &tr->array_buffer;
info->spare = NULL;
/* Force reading ring buffer for first read */
info->read = (unsigned int)-1;
@@ -7217,7 +7391,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
#endif
if (!info->spare) {
- info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+ info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
if (IS_ERR(info->spare)) {
ret = PTR_ERR(info->spare);
@@ -7235,7 +7409,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
again:
trace_access_lock(iter->cpu_file);
- ret = ring_buffer_read_page(iter->trace_buffer->buffer,
+ ret = ring_buffer_read_page(iter->array_buffer->buffer,
&info->spare,
count,
iter->cpu_file, 0);
@@ -7285,7 +7459,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
if (info->spare)
- ring_buffer_free_read_page(iter->trace_buffer->buffer,
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
kfree(info);
@@ -7295,7 +7469,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
}
struct buffer_ref {
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
void *page;
int cpu;
refcount_t refcount;
@@ -7390,7 +7564,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
again:
trace_access_lock(iter->cpu_file);
- entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
@@ -7403,7 +7577,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
refcount_set(&ref->refcount, 1);
- ref->buffer = iter->trace_buffer->buffer;
+ ref->buffer = iter->array_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (IS_ERR(ref->page)) {
ret = PTR_ERR(ref->page);
@@ -7431,7 +7605,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.nr_pages++;
*ppos += PAGE_SIZE;
- entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
trace_access_unlock(iter->cpu_file);
@@ -7475,7 +7649,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
{
struct inode *inode = file_inode(filp);
struct trace_array *tr = inode->i_private;
- struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct array_buffer *trace_buf = &tr->array_buffer;
int cpu = tracing_get_cpu(inode);
struct trace_seq *s;
unsigned long cnt;
@@ -7546,14 +7720,23 @@ static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- unsigned long *p = filp->private_data;
- char buf[64]; /* Not too big for a shallow stack */
+ ssize_t ret;
+ char *buf;
int r;
- r = scnprintf(buf, 63, "%ld", *p);
- buf[r++] = '\n';
+ /* 256 should be plenty to hold the amount needed */
+ buf = kmalloc(256, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ ftrace_update_tot_cnt,
+ ftrace_number_of_pages,
+ ftrace_number_of_groups);
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ kfree(buf);
+ return ret;
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -7747,7 +7930,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
- WARN_ONCE(!tr->percpu_dir,
+ MEM_FAIL(!tr->percpu_dir,
"Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
@@ -7935,9 +8118,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = set_tracer_flag(tr, 1 << index, val);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (ret < 0)
return ret;
@@ -8066,7 +8251,7 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
for (cnt = 0; opts[cnt].name; cnt++) {
create_trace_option_file(tr, &topts[cnt], flags,
&opts[cnt]);
- WARN_ONCE(topts[cnt].entry == NULL,
+ MEM_FAIL(topts[cnt].entry == NULL,
"Failed to create trace option: %s",
opts[cnt].name);
}
@@ -8123,7 +8308,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
unsigned long val;
int ret;
@@ -8213,7 +8398,7 @@ static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
-allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
+allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
@@ -8233,8 +8418,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
}
/* Allocate the first page for all buffers */
- set_buffer_entries(&tr->trace_buffer,
- ring_buffer_size(tr->trace_buffer.buffer, 0));
+ set_buffer_entries(&tr->array_buffer,
+ ring_buffer_size(tr->array_buffer.buffer, 0));
return 0;
}
@@ -8243,18 +8428,18 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
{
int ret;
- ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+ ret = allocate_trace_buffer(tr, &tr->array_buffer, size);
if (ret)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
- if (WARN_ON(ret)) {
- ring_buffer_free(tr->trace_buffer.buffer);
- tr->trace_buffer.buffer = NULL;
- free_percpu(tr->trace_buffer.data);
- tr->trace_buffer.data = NULL;
+ if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
+ ring_buffer_free(tr->array_buffer.buffer);
+ tr->array_buffer.buffer = NULL;
+ free_percpu(tr->array_buffer.data);
+ tr->array_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
@@ -8268,7 +8453,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
-static void free_trace_buffer(struct trace_buffer *buf)
+static void free_trace_buffer(struct array_buffer *buf)
{
if (buf->buffer) {
ring_buffer_free(buf->buffer);
@@ -8283,7 +8468,7 @@ static void free_trace_buffers(struct trace_array *tr)
if (!tr)
return;
- free_trace_buffer(&tr->trace_buffer);
+ free_trace_buffer(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
@@ -8314,24 +8499,43 @@ static void update_tracer_options(struct trace_array *tr)
mutex_unlock(&trace_types_lock);
}
-struct trace_array *trace_array_create(const char *name)
+/* Must have trace_types_lock held */
+struct trace_array *trace_array_find(const char *instance)
+{
+ struct trace_array *tr, *found = NULL;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, instance) == 0) {
+ found = tr;
+ break;
+ }
+ }
+
+ return found;
+}
+
+struct trace_array *trace_array_find_get(const char *instance)
{
struct trace_array *tr;
- int ret;
- mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
+ tr = trace_array_find(instance);
+ if (tr)
+ tr->ref++;
+ mutex_unlock(&trace_types_lock);
- ret = -EEXIST;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
- }
+ return tr;
+}
+
+static struct trace_array *trace_array_create(const char *name)
+{
+ struct trace_array *tr;
+ int ret;
ret = -ENOMEM;
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
- goto out_unlock;
+ return ERR_PTR(ret);
tr->name = kstrdup(name, GFP_KERNEL);
if (!tr->name)
@@ -8364,7 +8568,7 @@ struct trace_array *trace_array_create(const char *name)
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- tracefs_remove_recursive(tr->dir);
+ tracefs_remove(tr->dir);
goto out_free_tr;
}
@@ -8376,8 +8580,8 @@ struct trace_array *trace_array_create(const char *name)
list_add(&tr->list, &ftrace_trace_arrays);
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ tr->ref++;
+
return tr;
@@ -8387,24 +8591,79 @@ struct trace_array *trace_array_create(const char *name)
kfree(tr->name);
kfree(tr);
- out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
return ERR_PTR(ret);
}
-EXPORT_SYMBOL_GPL(trace_array_create);
static int instance_mkdir(const char *name)
{
- return PTR_ERR_OR_ZERO(trace_array_create(name));
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ ret = -EEXIST;
+ if (trace_array_find(name))
+ goto out_unlock;
+
+ tr = trace_array_create(name);
+
+ ret = PTR_ERR_OR_ZERO(tr);
+
+out_unlock:
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
+ return ret;
}
+/**
+ * trace_array_get_by_name - Create/Lookup a trace array, given its name.
+ * @name: The name of the trace array to be looked up/created.
+ *
+ * Returns pointer to trace array with given name.
+ * NULL, if it cannot be created.
+ *
+ * NOTE: This function increments the reference counter associated with the
+ * trace array returned. This makes sure it cannot be freed while in use.
+ * Use trace_array_put() once the trace array is no longer needed.
+ * If the trace_array is to be freed, trace_array_destroy() needs to
+ * be called after the trace_array_put(), or simply let user space delete
+ * it from the tracefs instances directory. But until the
+ * trace_array_put() is called, user space can not delete it.
+ *
+ */
+struct trace_array *trace_array_get_by_name(const char *name)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ tr = trace_array_create(name);
+
+ if (IS_ERR(tr))
+ tr = NULL;
+out_unlock:
+ if (tr)
+ tr->ref++;
+
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
+ return tr;
+}
+EXPORT_SYMBOL_GPL(trace_array_get_by_name);
+
static int __remove_instance(struct trace_array *tr)
{
int i;
- if (tr->ref || (tr->current_trace && tr->current_trace->ref))
+ /* Reference counter for a newly created trace array = 1. */
+ if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
return -EBUSY;
list_del(&tr->list);
@@ -8420,7 +8679,7 @@ static int __remove_instance(struct trace_array *tr)
event_trace_del_tracer(tr);
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
- tracefs_remove_recursive(tr->dir);
+ tracefs_remove(tr->dir);
free_trace_buffers(tr);
for (i = 0; i < tr->nr_topts; i++) {
@@ -8436,17 +8695,26 @@ static int __remove_instance(struct trace_array *tr)
return 0;
}
-int trace_array_destroy(struct trace_array *tr)
+int trace_array_destroy(struct trace_array *this_tr)
{
+ struct trace_array *tr;
int ret;
- if (!tr)
+ if (!this_tr)
return -EINVAL;
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- ret = __remove_instance(tr);
+ ret = -ENODEV;
+
+ /* Making sure trace array exists before destroying it. */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == this_tr) {
+ ret = __remove_instance(tr);
+ break;
+ }
+ }
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8464,12 +8732,9 @@ static int instance_rmdir(const char *name)
mutex_lock(&trace_types_lock);
ret = -ENODEV;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0) {
- ret = __remove_instance(tr);
- break;
- }
- }
+ tr = trace_array_find(name);
+ if (tr)
+ ret = __remove_instance(tr);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8482,7 +8747,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
instance_mkdir,
instance_rmdir);
- if (WARN_ON(!trace_instance_dir))
+ if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
}
@@ -8548,12 +8813,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tr->max_latency, &tracing_max_lat_fops);
+ trace_create_maxlat_file(tr, d_tracer);
#endif
if (ftrace_create_function_files(tr, d_tracer))
- WARN(1, "Could not allocate function filter files");
+ MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT
trace_create_file("snapshot", 0644, d_tracer,
@@ -8602,6 +8866,11 @@ struct dentry *tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Tracing disabled due to lockdown\n");
+ return ERR_PTR(-EPERM);
+ }
+
/* The top level trace array uses NULL as parent */
if (tr->dir)
return NULL;
@@ -8745,7 +9014,7 @@ static __init int tracer_init_tracefs(void)
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+ NULL, &tracing_dyn_info_fops);
#endif
create_trace_instances(d_tracer);
@@ -8830,13 +9099,13 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->tr = &global_trace;
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
- iter->trace_buffer = &global_trace.trace_buffer;
+ iter->array_buffer = &global_trace.array_buffer;
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ if (ring_buffer_overruns(iter->array_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -8877,7 +9146,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -8945,7 +9214,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
printk_nmi_direct_exit();
@@ -9044,6 +9313,12 @@ __init static int tracer_alloc_buffers(void)
int ring_buf_size;
int ret = -ENOMEM;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Tracing disabled due to lockdown\n");
+ return -EPERM;
+ }
+
/*
* Make sure we don't accidently add more trace options
* than we have bits for.
@@ -9094,8 +9369,7 @@ __init static int tracer_alloc_buffers(void)
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
- printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
- WARN_ON(1);
+ MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
goto out_free_savedcmd;
}
@@ -9168,7 +9442,8 @@ void __init early_trace_init(void)
if (tracepoint_printk) {
tracepoint_print_iter =
kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
- if (WARN_ON(!tracepoint_print_iter))
+ if (MEM_FAIL(!tracepoint_print_iter,
+ "Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
else
static_key_enable(&tracepoint_printk_key.key);
@@ -9208,6 +9483,11 @@ __init static int tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Can not set tracing clock due to lockdown\n");
+ return -EPERM;
+ }
+
printk(KERN_WARNING
"Unstable clock detected, switching default tracing clock to \"global\"\n"
"If you want to keep using the local clock, then add:\n"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..99372dd7d168 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,11 +11,14 @@
#include <linux/mmiotrace.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
+#include <linux/trace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/glob.h>
+#include <linux/irq_work.h>
+#include <linux/workqueue.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -49,6 +52,9 @@ enum trace_type {
#undef __field
#define __field(type, item) type item;
+#undef __field_fn
+#define __field_fn(type, item) type item;
+
#undef __field_struct
#define __field_struct(type, item) __field(type, item)
@@ -68,29 +74,37 @@ enum trace_type {
#define F_STRUCT(args...) args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct struct_name { \
struct trace_entry ent; \
tstruct \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
- filter, regfn) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
- filter) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter) __packed
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed
#include "trace_entries.h"
+/* Use this for memory failure errors */
+#define MEM_FAIL(condition, fmt, ...) ({ \
+ static bool __section(.data.once) __warned; \
+ int __ret_warn_once = !!(condition); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ pr_err("ERROR: " fmt, ##__VA_ARGS__); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -173,9 +187,9 @@ struct trace_array_cpu {
struct tracer;
struct trace_option_dentry;
-struct trace_buffer {
+struct array_buffer {
struct trace_array *tr;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct trace_array_cpu __percpu *data;
u64 time_start;
int cpu;
@@ -246,7 +260,7 @@ struct cond_snapshot {
struct trace_array {
struct list_head list;
char *name;
- struct trace_buffer trace_buffer;
+ struct array_buffer array_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* The max_buffer is used to snapshot the trace when a maximum
@@ -254,16 +268,21 @@ struct trace_array {
* Some tracers will use this to store a maximum trace while
* it continues examining live traces.
*
- * The buffers for the max_buffer are set up the same as the trace_buffer
+ * The buffers for the max_buffer are set up the same as the array_buffer
* When a snapshot is taken, the buffer of the max_buffer is swapped
- * with the buffer of the trace_buffer and the buffers are reset for
- * the trace_buffer so the tracing can continue.
+ * with the buffer of the array_buffer and the buffers are reset for
+ * the array_buffer so the tracing can continue.
*/
- struct trace_buffer max_buffer;
+ struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
+#ifdef CONFIG_FSNOTIFY
+ struct dentry *d_max_latency;
+ struct work_struct fsnotify_work;
+ struct irq_work fsnotify_irqwork;
+#endif
#endif
struct trace_pid_list __rcu *filtered_pids;
/*
@@ -337,7 +356,9 @@ extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
-extern void trace_array_put(struct trace_array *tr);
+extern int tracing_check_open_get_tr(struct trace_array *tr);
+extern struct trace_array *trace_array_find(const char *instance);
+extern struct trace_array *trace_array_find_get(const char *instance);
extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
@@ -365,11 +386,11 @@ static inline struct trace_array *top_trace_array(void)
__builtin_types_compatible_p(typeof(var), type *)
#undef IF_ASSIGN
-#define IF_ASSIGN(var, entry, etype, id) \
- if (FTRACE_CMP_TYPE(var, etype)) { \
- var = (typeof(var))(entry); \
- WARN_ON(id && (entry)->type != id); \
- break; \
+#define IF_ASSIGN(var, entry, etype, id) \
+ if (FTRACE_CMP_TYPE(var, etype)) { \
+ var = (typeof(var))(entry); \
+ WARN_ON(id != 0 && (entry)->type != id); \
+ break; \
}
/* Will cause compile errors if type is not found. */
@@ -677,11 +698,11 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
-void tracing_reset(struct trace_buffer *buf, int cpu);
-void tracing_reset_online_cpus(struct trace_buffer *buf);
+void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
@@ -697,7 +718,7 @@ struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
struct ring_buffer_event *
-trace_buffer_lock_reserve(struct ring_buffer *buffer,
+trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
unsigned long flags,
@@ -709,7 +730,7 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
int trace_empty(struct trace_iterator *iter);
@@ -785,6 +806,17 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+void latency_fsnotify(struct trace_array *tr);
+
+#else
+
+static inline void latency_fsnotify(struct trace_array *tr) { }
+
+#endif
+
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
@@ -803,6 +835,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+extern unsigned long ftrace_number_of_pages;
+extern unsigned long ftrace_number_of_groups;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -852,9 +886,7 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...);
-int trace_array_printk_buf(struct ring_buffer *buffer,
+int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
@@ -931,22 +963,31 @@ extern void __trace_graph_return(struct trace_array *tr,
unsigned long flags, int pc);
#ifdef CONFIG_DYNAMIC_FTRACE
-extern struct ftrace_hash *ftrace_graph_hash;
-extern struct ftrace_hash *ftrace_graph_notrace_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
+ struct ftrace_hash *hash;
preempt_disable_notrace();
- if (ftrace_hash_empty(ftrace_graph_hash)) {
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible());
+
+ if (ftrace_hash_empty(hash)) {
ret = 1;
goto out;
}
- if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ if (ftrace_lookup_ip(hash, addr)) {
/*
* This needs to be cleared on the return functions
@@ -982,10 +1023,20 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
+ struct ftrace_hash *notrace_hash;
preempt_disable_notrace();
- if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ !preemptible());
+
+ if (ftrace_lookup_ip(notrace_hash, addr))
ret = 1;
preempt_enable_notrace();
@@ -1038,7 +1089,7 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
+ return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -1126,6 +1177,11 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd);
void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent);
void ftrace_destroy_filter_files(struct ftrace_ops *ops);
+
+extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset);
+extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset);
#else
struct ftrace_func_command;
@@ -1348,17 +1404,17 @@ struct trace_subsystem_dir {
};
extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc)
{
@@ -1371,7 +1427,7 @@ void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);
static inline void
-__trace_event_discard_commit(struct ring_buffer *buffer,
+__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (this_cpu_read(trace_buffered_event) == event) {
@@ -1397,7 +1453,7 @@ __trace_event_discard_commit(struct ring_buffer *buffer,
*/
static inline bool
__event_trigger_test_discard(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry,
enum event_trigger_type *tt)
@@ -1432,7 +1488,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
*/
static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry, unsigned long irq_flags, int pc)
{
@@ -1463,7 +1519,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
*/
static inline void
event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct ring_buffer *buffer,
+ struct trace_buffer *buffer,
struct ring_buffer_event *event,
void *entry, unsigned long irq_flags, int pc,
struct pt_regs *regs)
@@ -1582,6 +1638,7 @@ extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_inject_fops;
#ifdef CONFIG_HIST_TRIGGERS
extern int register_trigger_hist_cmd(void);
@@ -1869,11 +1926,19 @@ extern const char *__start___tracepoint_str[];
extern const char *__stop___tracepoint_str[];
void trace_printk_control(bool enabled);
-void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+/* Used from boot time tracer */
+extern int trace_set_options(struct trace_array *tr, char *option);
+extern int tracing_set_tracer(struct trace_array *tr, const char *buf);
+extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id);
+extern int tracing_set_cpumask(struct trace_array *tr,
+ cpumask_var_t tracing_cpumask_new);
+
+
#define MAX_EVENT_NAME_LEN 64
extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
@@ -1898,17 +1963,15 @@ extern void tracing_log_err(struct trace_array *tr,
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
extern struct trace_event_call \
__aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
@@ -1933,6 +1996,9 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
void trace_event_eval_update(struct trace_eval_map **map, int len);
+/* Used from boot time tracer */
+extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
+extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 80e0b2aca703..2e9a4746ea85 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -178,14 +178,14 @@ static int benchmark_event_kthread(void *arg)
int trace_benchmark_reg(void)
{
if (!ok_to_run) {
- pr_warning("trace benchmark cannot be started via kernel command line\n");
+ pr_warn("trace benchmark cannot be started via kernel command line\n");
return -EBUSY;
}
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
if (IS_ERR(bm_event_thread)) {
- pr_warning("trace benchmark failed to create kernel thread\n");
+ pr_warn("trace benchmark failed to create kernel thread\n");
return PTR_ERR(bm_event_thread);
}
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..06d7feb5255f
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * trace_boot.c
+ * Tracing kernel boot-time
+ */
+
+#define pr_fmt(fmt) "trace_boot: " fmt
+
+#include <linux/bootconfig.h>
+#include <linux/cpumask.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/trace.h>
+#include <linux/trace_events.h>
+
+#include "trace.h"
+
+#define MAX_BUF_LEN 256
+
+static void __init
+trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char buf[MAX_BUF_LEN];
+ unsigned long v = 0;
+
+ /* Common ftrace options */
+ xbc_node_for_each_array_value(node, "options", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ pr_err("String is too long: %s\n", p);
+ continue;
+ }
+
+ if (trace_set_options(tr, buf) < 0)
+ pr_err("Failed to set option: %s\n", buf);
+ }
+
+ p = xbc_node_find_value(node, "trace_clock", NULL);
+ if (p && *p != '\0') {
+ if (tracing_set_clock(tr, p) < 0)
+ pr_err("Failed to set trace clock: %s\n", p);
+ }
+
+ p = xbc_node_find_value(node, "buffer_size", NULL);
+ if (p && *p != '\0') {
+ v = memparse(p, NULL);
+ if (v < PAGE_SIZE)
+ pr_err("Buffer size is too small: %s\n", p);
+ if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+ pr_err("Failed to resize trace buffer to %s\n", p);
+ }
+
+ p = xbc_node_find_value(node, "cpumask", NULL);
+ if (p && *p != '\0') {
+ cpumask_var_t new_mask;
+
+ if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ if (cpumask_parse(p, new_mask) < 0 ||
+ tracing_set_cpumask(tr, new_mask) < 0)
+ pr_err("Failed to set new CPU mask %s\n", p);
+ free_cpumask_var(new_mask);
+ }
+ }
+}
+
+#ifdef CONFIG_EVENT_TRACING
+static void __init
+trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p;
+
+ xbc_node_for_each_array_value(node, "events", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ pr_err("String is too long: %s\n", p);
+ continue;
+ }
+
+ if (ftrace_set_clr_event(tr, buf, 1) < 0)
+ pr_err("Failed to enable event: %s\n", p);
+ }
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int __init
+trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
+{
+ struct dynevent_cmd cmd;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *val;
+ int ret;
+
+ kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+
+ ret = kprobe_event_gen_cmd_start(&cmd, event, NULL);
+ if (ret)
+ return ret;
+
+ xbc_node_for_each_array_value(node, "probes", anode, val) {
+ ret = kprobe_event_add_field(&cmd, val);
+ if (ret)
+ return ret;
+ }
+
+ ret = kprobe_event_gen_cmd_end(&cmd);
+ if (ret)
+ pr_err("Failed to add probe: %s\n", buf);
+
+ return ret;
+}
+#else
+static inline int __init
+trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
+{
+ pr_err("Kprobe event is not supported.\n");
+ return -ENOTSUPP;
+}
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init
+trace_boot_add_synth_event(struct xbc_node *node, const char *event)
+{
+ struct dynevent_cmd cmd;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p;
+ int ret;
+
+ synth_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+
+ ret = synth_event_gen_cmd_start(&cmd, event, NULL);
+ if (ret)
+ return ret;
+
+ xbc_node_for_each_array_value(node, "fields", anode, p) {
+ ret = synth_event_add_field_str(&cmd, p);
+ if (ret)
+ return ret;
+ }
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ if (ret < 0)
+ pr_err("Failed to add synthetic event: %s\n", buf);
+
+ return ret;
+}
+#else
+static inline int __init
+trace_boot_add_synth_event(struct xbc_node *node, const char *event)
+{
+ pr_err("Synthetic event is not supported.\n");
+ return -ENOTSUPP;
+}
+#endif
+
+static void __init
+trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
+ struct xbc_node *enode)
+{
+ struct trace_event_file *file;
+ struct xbc_node *anode;
+ char buf[MAX_BUF_LEN];
+ const char *p, *group, *event;
+
+ group = xbc_node_get_data(gnode);
+ event = xbc_node_get_data(enode);
+
+ if (!strcmp(group, "kprobes"))
+ if (trace_boot_add_kprobe_event(enode, event) < 0)
+ return;
+ if (!strcmp(group, "synthetic"))
+ if (trace_boot_add_synth_event(enode, event) < 0)
+ return;
+
+ mutex_lock(&event_mutex);
+ file = find_event_file(tr, group, event);
+ if (!file) {
+ pr_err("Failed to find event: %s:%s\n", group, event);
+ goto out;
+ }
+
+ p = xbc_node_find_value(enode, "filter", NULL);
+ if (p && *p != '\0') {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("filter string is too long: %s\n", p);
+ else if (apply_event_filter(file, buf) < 0)
+ pr_err("Failed to apply filter: %s\n", buf);
+ }
+
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", buf);
+ }
+
+ if (xbc_node_find_value(enode, "enable", NULL)) {
+ if (trace_event_enable_disable(file, 1, 0) < 0)
+ pr_err("Failed to enable event node: %s:%s\n",
+ group, event);
+ }
+out:
+ mutex_unlock(&event_mutex);
+}
+
+static void __init
+trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *gnode, *enode;
+
+ node = xbc_node_find_child(node, "event");
+ if (!node)
+ return;
+ /* per-event key starts with "event.GROUP.EVENT" */
+ xbc_node_for_each_child(node, gnode)
+ xbc_node_for_each_child(gnode, enode)
+ trace_boot_init_one_event(tr, gnode, enode);
+}
+#else
+#define trace_boot_enable_events(tr, node) do {} while (0)
+#define trace_boot_init_events(tr, node) do {} while (0)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void __init
+trace_boot_set_ftrace_filter(struct trace_array *tr, struct xbc_node *node)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char *q;
+
+ xbc_node_for_each_array_value(node, "ftrace.filters", anode, p) {
+ q = kstrdup(p, GFP_KERNEL);
+ if (!q)
+ return;
+ if (ftrace_set_filter(tr->ops, q, strlen(q), 0) < 0)
+ pr_err("Failed to add %s to ftrace filter\n", p);
+ else
+ ftrace_filter_param = true;
+ kfree(q);
+ }
+ xbc_node_for_each_array_value(node, "ftrace.notraces", anode, p) {
+ q = kstrdup(p, GFP_KERNEL);
+ if (!q)
+ return;
+ if (ftrace_set_notrace(tr->ops, q, strlen(q), 0) < 0)
+ pr_err("Failed to add %s to ftrace filter\n", p);
+ else
+ ftrace_filter_param = true;
+ kfree(q);
+ }
+}
+#else
+#define trace_boot_set_ftrace_filter(tr, node) do {} while (0)
+#endif
+
+static void __init
+trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
+{
+ const char *p;
+
+ trace_boot_set_ftrace_filter(tr, node);
+
+ p = xbc_node_find_value(node, "tracer", NULL);
+ if (p && *p != '\0') {
+ if (tracing_set_tracer(tr, p) < 0)
+ pr_err("Failed to set given tracer: %s\n", p);
+ }
+}
+
+static void __init
+trace_boot_init_one_instance(struct trace_array *tr, struct xbc_node *node)
+{
+ trace_boot_set_instance_options(tr, node);
+ trace_boot_init_events(tr, node);
+ trace_boot_enable_events(tr, node);
+ trace_boot_enable_tracer(tr, node);
+}
+
+static void __init
+trace_boot_init_instances(struct xbc_node *node)
+{
+ struct xbc_node *inode;
+ struct trace_array *tr;
+ const char *p;
+
+ node = xbc_node_find_child(node, "instance");
+ if (!node)
+ return;
+
+ xbc_node_for_each_child(node, inode) {
+ p = xbc_node_get_data(inode);
+ if (!p || *p == '\0')
+ continue;
+
+ tr = trace_array_get_by_name(p);
+ if (!tr) {
+ pr_err("Failed to get trace instance %s\n", p);
+ continue;
+ }
+ trace_boot_init_one_instance(tr, inode);
+ trace_array_put(tr);
+ }
+}
+
+static int __init trace_boot_init(void)
+{
+ struct xbc_node *trace_node;
+ struct trace_array *tr;
+
+ trace_node = xbc_find_node("ftrace");
+ if (!trace_node)
+ return 0;
+
+ tr = top_trace_array();
+ if (!tr)
+ return 0;
+
+ /* Global trace array is also one instance */
+ trace_boot_init_one_instance(tr, trace_node);
+ trace_boot_init_instances(trace_node);
+
+ return 0;
+}
+
+fs_initcall(trace_boot_init);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3ea65cdff30d..eff099123aa2 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,10 +32,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
+ struct trace_buffer *buffer;
struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
- struct ring_buffer *buffer;
unsigned long flags;
int pc;
const char *p;
@@ -55,12 +55,12 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
raw_local_irq_save(flags);
current->trace_recursion |= TRACE_BRANCH_BIT;
- data = this_cpu_ptr(tr->trace_buffer.data);
+ data = this_cpu_ptr(tr->array_buffer.data);
if (atomic_read(&data->disabled))
goto out;
pc = preempt_count();
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
@@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m)
return 0;
}
-static inline long get_incorrect_percent(struct ftrace_branch_data *p)
+static inline long get_incorrect_percent(const struct ftrace_branch_data *p)
{
long percent;
@@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx)
return p;
}
-static int annotated_branch_stat_cmp(void *p1, void *p2)
+static int annotated_branch_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_branch_data *a = p1;
- struct ftrace_branch_data *b = p2;
+ const struct ftrace_branch_data *a = p1;
+ const struct ftrace_branch_data *b = p2;
long percent_a, percent_b;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index fa100ed3b4de..9f2e8520b748 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
return -EINVAL;
event++;
}
+ argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -61,10 +62,13 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
for_each_dyn_event_safe(pos, n) {
if (type && type != pos->ops)
continue;
- if (pos->ops->match(system, event, pos)) {
- ret = pos->ops->free(pos);
+ if (!pos->ops->match(system, event,
+ argc, (const char **)argv, pos))
+ continue;
+
+ ret = pos->ops->free(pos);
+ if (ret)
break;
- }
}
mutex_unlock(&event_mutex);
@@ -170,6 +174,10 @@ static int dyn_event_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
ret = dyn_events_release_all(NULL);
if (ret < 0)
@@ -215,3 +223,215 @@ static __init int init_dynamic_event(void)
return 0;
}
fs_initcall(init_dynamic_event);
+
+/**
+ * dynevent_arg_add - Add an arg to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @arg: The argument to append to the current cmd
+ * @check_arg: An (optional) pointer to a function checking arg sanity
+ *
+ * Append an argument to a dynevent_cmd. The argument string will be
+ * appended to the current cmd string, followed by a separator, if
+ * applicable. Before the argument is added, the @check_arg function,
+ * if present, will be used to check the sanity of the current arg
+ * string.
+ *
+ * The cmd string and separator should be set using the
+ * dynevent_arg_init() before any arguments are added using this
+ * function.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_arg_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg *arg,
+ dynevent_check_arg_fn_t check_arg)
+{
+ int ret = 0;
+
+ if (check_arg) {
+ ret = check_arg(arg);
+ if (ret)
+ return ret;
+ }
+
+ ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator);
+ if (ret) {
+ pr_err("String is too long: %s%c\n", arg->str, arg->separator);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_arg_pair_add - Add an arg pair to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @arg_pair: The argument pair to append to the current cmd
+ * @check_arg: An (optional) pointer to a function checking arg sanity
+ *
+ * Append an argument pair to a dynevent_cmd. An argument pair
+ * consists of a left-hand-side argument and a right-hand-side
+ * argument separated by an operator, which can be whitespace, all
+ * followed by a separator, if applicable. This can be used to add
+ * arguments of the form 'type variable_name;' or 'x+y'.
+ *
+ * The lhs argument string will be appended to the current cmd string,
+ * followed by an operator, if applicable, followd by the rhs string,
+ * followed finally by a separator, if applicable. Before the
+ * argument is added, the @check_arg function, if present, will be
+ * used to check the sanity of the current arg strings.
+ *
+ * The cmd strings, operator, and separator should be set using the
+ * dynevent_arg_pair_init() before any arguments are added using this
+ * function.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_arg_pair_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg_pair *arg_pair,
+ dynevent_check_arg_fn_t check_arg)
+{
+ int ret = 0;
+
+ if (check_arg) {
+ ret = check_arg(arg_pair);
+ if (ret)
+ return ret;
+ }
+
+ ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs,
+ arg_pair->operator, arg_pair->rhs,
+ arg_pair->separator);
+ if (ret) {
+ pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs,
+ arg_pair->operator, arg_pair->rhs,
+ arg_pair->separator);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_str_add - Add a string to a dynevent_cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd
+ * @str: The string to append to the current cmd
+ *
+ * Append a string to a dynevent_cmd. The string will be appended to
+ * the current cmd string as-is, with nothing prepended or appended.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int dynevent_str_add(struct dynevent_cmd *cmd, const char *str)
+{
+ int ret = 0;
+
+ ret = seq_buf_puts(&cmd->seq, str);
+ if (ret) {
+ pr_err("String is too long: %s\n", str);
+ return -E2BIG;
+ }
+
+ return ret;
+}
+
+/**
+ * dynevent_cmd_init - Initialize a dynevent_cmd object
+ * @cmd: A pointer to the dynevent_cmd struct representing the cmd
+ * @buf: A pointer to the buffer to generate the command into
+ * @maxlen: The length of the buffer the command will be generated into
+ * @type: The type of the cmd, checked against further operations
+ * @run_command: The type-specific function that will actually run the command
+ *
+ * Initialize a dynevent_cmd. A dynevent_cmd is used to build up and
+ * run dynamic event creation commands, such as commands for creating
+ * synthetic and kprobe events. Before calling any of the functions
+ * used to build the command, a dynevent_cmd object should be
+ * instantiated and initialized using this function.
+ *
+ * The initialization sets things up by saving a pointer to the
+ * user-supplied buffer and its length via the @buf and @maxlen
+ * params, and by saving the cmd-specific @type and @run_command
+ * params which are used to check subsequent dynevent_cmd operations
+ * and actually run the command when complete.
+ */
+void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen,
+ enum dynevent_type type,
+ dynevent_create_fn_t run_command)
+{
+ memset(cmd, '\0', sizeof(*cmd));
+
+ seq_buf_init(&cmd->seq, buf, maxlen);
+ cmd->type = type;
+ cmd->run_command = run_command;
+}
+
+/**
+ * dynevent_arg_init - Initialize a dynevent_arg object
+ * @arg: A pointer to the dynevent_arg struct representing the arg
+ * @separator: An (optional) separator, appended after adding the arg
+ *
+ * Initialize a dynevent_arg object. A dynevent_arg represents an
+ * object used to append single arguments to the current command
+ * string. After the arg string is successfully appended to the
+ * command string, the optional @separator is appended. If no
+ * separator was specified when initializing the arg, a space will be
+ * appended.
+ */
+void dynevent_arg_init(struct dynevent_arg *arg,
+ char separator)
+{
+ memset(arg, '\0', sizeof(*arg));
+
+ if (!separator)
+ separator = ' ';
+ arg->separator = separator;
+}
+
+/**
+ * dynevent_arg_pair_init - Initialize a dynevent_arg_pair object
+ * @arg_pair: A pointer to the dynevent_arg_pair struct representing the arg
+ * @operator: An (optional) operator, appended after adding the first arg
+ * @separator: An (optional) separator, appended after adding the second arg
+ *
+ * Initialize a dynevent_arg_pair object. A dynevent_arg_pair
+ * represents an object used to append argument pairs such as 'type
+ * variable_name;' or 'x+y' to the current command string. An
+ * argument pair consists of a left-hand-side argument and a
+ * right-hand-side argument separated by an operator, which can be
+ * whitespace, all followed by a separator, if applicable. After the
+ * first arg string is successfully appended to the command string,
+ * the optional @operator is appended, followed by the second arg and
+ * and optional @separator. If no separator was specified when
+ * initializing the arg, a space will be appended.
+ */
+void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
+ char operator, char separator)
+{
+ memset(arg_pair, '\0', sizeof(*arg_pair));
+
+ if (!operator)
+ operator = ' ';
+ arg_pair->operator = operator;
+
+ if (!separator)
+ separator = ' ';
+ arg_pair->separator = separator;
+}
+
+/**
+ * dynevent_create - Create the dynamic event contained in dynevent_cmd
+ * @cmd: The dynevent_cmd object containing the dynamic event creation command
+ *
+ * Once a dynevent_cmd object has been successfully built up via the
+ * dynevent_cmd_init(), dynevent_arg_add() and dynevent_arg_pair_add()
+ * functions, this function runs the final command to actually create
+ * the event.
+ *
+ * Return: 0 if the event was successfully created, error otherwise.
+ */
+int dynevent_create(struct dynevent_cmd *cmd)
+{
+ return cmd->run_command(cmd);
+}
+EXPORT_SYMBOL_GPL(dynevent_create);
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 8c334064e4d6..d6857a254ede 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -31,8 +31,9 @@ struct dyn_event;
* @is_busy: Check whether given event is busy so that it can not be deleted.
* Return true if it is busy, otherwides false.
* @free: Delete the given event. Return 0 if success, otherwides error.
- * @match: Check whether given event and system name match this event.
- * Return true if it matches, otherwides false.
+ * @match: Check whether given event and system name match this event. The argc
+ * and argv is used for exact match. Return true if it matches, otherwides
+ * false.
*
* Except for @create, these methods are called under holding event_mutex.
*/
@@ -43,7 +44,7 @@ struct dyn_event_operations {
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
bool (*match)(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
};
/* Register new dyn_event type -- must be called at first */
@@ -116,4 +117,36 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
#define for_each_dyn_event_safe(pos, n) \
list_for_each_entry_safe(pos, n, &dyn_event_list, list)
+extern void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen,
+ enum dynevent_type type,
+ dynevent_create_fn_t run_command);
+
+typedef int (*dynevent_check_arg_fn_t)(void *data);
+
+struct dynevent_arg {
+ const char *str;
+ char separator; /* e.g. ';', ',', or nothing */
+};
+
+extern void dynevent_arg_init(struct dynevent_arg *arg,
+ char separator);
+extern int dynevent_arg_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg *arg,
+ dynevent_check_arg_fn_t check_arg);
+
+struct dynevent_arg_pair {
+ const char *lhs;
+ const char *rhs;
+ char operator; /* e.g. '=' or nothing */
+ char separator; /* e.g. ';', ',', or nothing */
+};
+
+extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
+ char operator, char separator);
+
+extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd,
+ struct dynevent_arg_pair *arg_pair,
+ dynevent_check_arg_fn_t check_arg);
+extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str);
+
#endif
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index fc8e97328e54..f22746f3c132 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field( unsigned long, ip )
- __field( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
),
F_printk(" %ps <-- %ps",
(void *)__entry->ip, (void *)__entry->parent_ip),
- FILTER_TRACE_FN,
-
perf_ftrace_event_register
);
@@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),
-
- FILTER_OTHER
+ F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);
/* Function return entry */
@@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_desc( unsigned long, ret, func )
+ __field_desc( unsigned long, ret, overrun )
__field_desc( unsigned long long, ret, calltime)
__field_desc( unsigned long long, ret, rettime )
- __field_desc( unsigned long, ret, overrun )
__field_desc( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
- __entry->depth),
-
- FILTER_OTHER
+ __entry->depth)
);
/*
@@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -174,7 +164,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __dynamic_array(unsigned long, caller )
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
@@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
FTRACE_ENTRY(user_stack, userstack_entry,
@@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
/*
@@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->fmt),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->fmt)
);
FTRACE_ENTRY_REG(print, print_entry,
@@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
- FILTER_OTHER,
-
ftrace_event_register
);
@@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
),
F_printk("id:%04x %08x",
- __entry->id, (int)__entry->buf[0]),
-
- FILTER_OTHER
+ __entry->id, (int)__entry->buf[0])
);
FTRACE_ENTRY(bputs, bputs_entry,
@@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->str),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->str)
);
FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
__field_desc( resource_size_t, rw, phys )
__field_desc( unsigned long, rw, value )
__field_desc( unsigned long, rw, pc )
- __field_desc( int, rw, map_id )
+ __field_desc( int, rw, map_id )
__field_desc( unsigned char, rw, opcode )
__field_desc( unsigned char, rw, width )
),
F_printk("%lx %lx %lx %d %x %x",
(unsigned long)__entry->phys, __entry->value, __entry->pc,
- __entry->map_id, __entry->opcode, __entry->width),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode, __entry->width)
);
FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
__field_desc( resource_size_t, map, phys )
__field_desc( unsigned long, map, virt )
__field_desc( unsigned long, map, len )
- __field_desc( int, map, map_id )
+ __field_desc( int, map, map_id )
__field_desc( unsigned char, map, opcode )
),
F_printk("%lx %lx %lx %d %x",
(unsigned long)__entry->phys, __entry->virt, __entry->len,
- __entry->map_id, __entry->opcode),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode)
);
@@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch,
F_printk("%u:%s:%s (%u)%s",
__entry->line,
__entry->func, __entry->file, __entry->correct,
- __entry->constant ? " CONSTANT" : ""),
-
- FILTER_OTHER
+ __entry->constant ? " CONSTANT" : "")
);
@@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->duration,
__entry->outer_duration,
__entry->nmi_total_ts,
- __entry->nmi_count),
-
- FILTER_OTHER
+ __entry->nmi_count)
);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0892e38ed6fb..643e0b19920d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"
@@ -26,8 +27,10 @@ static int total_ref_count;
static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
+ int ret;
+
if (tp_event->perf_perm) {
- int ret = tp_event->perf_perm(tp_event, p_event);
+ ret = tp_event->perf_perm(tp_event, p_event);
if (ret)
return ret;
}
@@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
if (!is_sampling_event(p_event))
return 0;
@@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
return 0;
}
@@ -272,9 +277,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
goto out;
}
+ mutex_lock(&event_mutex);
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
destroy_local_trace_kprobe(tp_event);
+ mutex_unlock(&event_mutex);
out:
kfree(func);
return ret;
@@ -282,8 +289,10 @@ out:
void perf_kprobe_destroy(struct perf_event *p_event)
{
+ mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ mutex_unlock(&event_mutex);
destroy_local_trace_kprobe(p_event->tp_event);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c7506bc81b75..f38234ecea18 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) fmt
#include <linux/workqueue.h>
+#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/tracefs.h>
@@ -23,6 +24,7 @@
#include <linux/delay.h>
#include <trace/events/sched.h>
+#include <trace/syscall.h>
#include <asm/setup.h>
@@ -236,7 +238,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
if (!pid_list)
return false;
- data = this_cpu_ptr(tr->trace_buffer.data);
+ data = this_cpu_ptr(tr->array_buffer.data);
return data->ignore_pid;
}
@@ -255,12 +257,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
/*
- * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
fbuffer->pc--;
fbuffer->trace_file = trace_file;
@@ -271,6 +273,7 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
if (!fbuffer->event)
return NULL;
+ fbuffer->regs = NULL;
fbuffer->entry = ring_buffer_event_data(fbuffer->event);
return fbuffer->entry;
}
@@ -319,7 +322,8 @@ void trace_event_enable_cmd_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
@@ -333,7 +337,6 @@ void trace_event_enable_cmd_record(bool enable)
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
void trace_event_enable_tgid_record(bool enable)
@@ -341,7 +344,8 @@ void trace_event_enable_tgid_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
@@ -355,7 +359,6 @@ void trace_event_enable_tgid_record(bool enable)
&file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
static int __ftrace_event_enable_disable(struct trace_event_file *file,
@@ -545,7 +548,7 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, prev) &&
trace_ignore_this_task(pid_list, next));
}
@@ -559,7 +562,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, next));
}
@@ -570,12 +573,12 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
/* Nothing to do if we are already tracing */
- if (!this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ if (!this_cpu_read(tr->array_buffer.data->ignore_pid))
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, task));
}
@@ -586,13 +589,13 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
/* Nothing to do if we are not tracing */
- if (this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ if (this_cpu_read(tr->array_buffer.data->ignore_pid))
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
/* Set tracing if current is enabled */
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, current));
}
@@ -624,7 +627,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
}
for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false;
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
rcu_assign_pointer(tr->filtered_pids, NULL);
@@ -696,7 +699,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- tracefs_remove_recursive(dir->entry);
+ tracefs_remove(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -715,7 +718,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
}
spin_unlock(&dir->d_lock);
- tracefs_remove_recursive(dir);
+ tracefs_remove(dir);
}
list_del(&file->list);
@@ -787,11 +790,13 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
return ret;
}
-static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
int ret;
+ if (!tr)
+ return -ENOENT;
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -824,7 +829,6 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_set_clr_event);
/**
* trace_set_clr_event - enable or disable an event
@@ -849,6 +853,32 @@ int trace_set_clr_event(const char *system, const char *event, int set)
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
+/**
+ * trace_array_set_clr_event - enable or disable an event for a trace array.
+ * @tr: concerned trace array.
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @enable: true to enable, false to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_array_set_clr_event(struct trace_array *tr, const char *system,
+ const char *event, bool enable)
+{
+ int set;
+
+ if (!tr)
+ return -ENOENT;
+
+ set = (enable == true) ? 1 : 0;
+ return __ftrace_set_clr_event(tr, NULL, system, event, set);
+}
+EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
+
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
@@ -1294,6 +1324,8 @@ static int trace_format_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
+ /* Do we want to hide event format files on tracefs lockdown? */
+
ret = seq_open(file, &trace_format_seq_ops);
if (ret < 0)
return ret;
@@ -1440,28 +1472,17 @@ static int system_tr_open(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
/* Make a temporary dir that has no system but points to tr */
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
- if (!dir) {
- trace_array_put(tr);
+ if (!dir)
return -ENOMEM;
- }
-
- dir->tr = tr;
- ret = tracing_open_generic(inode, filp);
+ ret = tracing_open_generic_tr(inode, filp);
if (ret < 0) {
- trace_array_put(tr);
kfree(dir);
return ret;
}
-
+ dir->tr = tr;
filp->private_data = dir;
return 0;
@@ -1575,7 +1596,7 @@ static void ignore_task_cpu(void *data)
pid_list = rcu_dereference_protected(tr->filtered_pids,
mutex_is_locked(&event_mutex));
- this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ this_cpu_write(tr->array_buffer.data->ignore_pid,
trace_ignore_this_task(pid_list, current));
}
@@ -1771,6 +1792,10 @@ ftrace_event_open(struct inode *inode, struct file *file,
struct seq_file *m;
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
ret = seq_open(file, seq_ops);
if (ret < 0)
return ret;
@@ -1795,6 +1820,7 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
{
const struct seq_operations *seq_ops = &show_event_seq_ops;
+ /* Checks for tracefs lockdown */
return ftrace_event_open(inode, file, seq_ops);
}
@@ -1805,8 +1831,9 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
@@ -1825,8 +1852,9 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file)
struct trace_array *tr = inode->i_private;
int ret;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
@@ -1991,7 +2019,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
*/
head = trace_get_fields(call);
if (list_empty(head)) {
- ret = call->class->define_fields(call);
+ struct trace_event_fields *field = call->class->fields_array;
+ unsigned int offset = sizeof(struct trace_entry);
+
+ for (; field->type; field++) {
+ if (field->type == TRACE_FUNCTION_TYPE) {
+ ret = field->define_fields(call);
+ break;
+ }
+
+ offset = ALIGN(offset, field->align);
+ ret = trace_define_field(call, field->type, field->name,
+ offset, field->size,
+ field->is_signed, field->filter_type);
+ if (ret)
+ break;
+
+ offset += field->size;
+ }
if (ret < 0) {
pr_warn("Could not initialize trace point events/%s\n",
name);
@@ -2018,6 +2063,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ if (call->event.type && call->class->reg)
+ trace_create_file("inject", 0200, file->dir, file,
+ &event_inject_fops);
+#endif
+
return 0;
}
@@ -2503,6 +2554,91 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
return file;
}
+/**
+ * trace_get_event_file - Find and return a trace event file
+ * @instance: The name of the trace instance containing the event
+ * @system: The name of the system containing the event
+ * @event: The name of the event
+ *
+ * Return a trace event file given the trace instance name, trace
+ * system, and trace event name. If the instance name is NULL, it
+ * refers to the top-level trace array.
+ *
+ * This function will look it up and return it if found, after calling
+ * trace_array_get() to prevent the instance from going away, and
+ * increment the event's module refcount to prevent it from being
+ * removed.
+ *
+ * To release the file, call trace_put_event_file(), which will call
+ * trace_array_put() and decrement the event's module refcount.
+ *
+ * Return: The trace event on success, ERR_PTR otherwise.
+ */
+struct trace_event_file *trace_get_event_file(const char *instance,
+ const char *system,
+ const char *event)
+{
+ struct trace_array *tr = top_trace_array();
+ struct trace_event_file *file = NULL;
+ int ret = -EINVAL;
+
+ if (instance) {
+ tr = trace_array_find_get(instance);
+ if (!tr)
+ return ERR_PTR(-ENOENT);
+ } else {
+ ret = trace_array_get(tr);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ mutex_lock(&event_mutex);
+
+ file = find_event_file(tr, system, event);
+ if (!file) {
+ trace_array_put(tr);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Don't let event modules unload while in use */
+ ret = try_module_get(file->event_call->mod);
+ if (!ret) {
+ trace_array_put(tr);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = 0;
+ out:
+ mutex_unlock(&event_mutex);
+
+ if (ret)
+ file = ERR_PTR(ret);
+
+ return file;
+}
+EXPORT_SYMBOL_GPL(trace_get_event_file);
+
+/**
+ * trace_put_event_file - Release a file from trace_get_event_file()
+ * @file: The trace event file
+ *
+ * If a file was retrieved using trace_get_event_file(), this should
+ * be called when it's no longer needed. It will cancel the previous
+ * trace_array_get() called by that function, and decrement the
+ * event's module refcount.
+ */
+void trace_put_event_file(struct trace_event_file *file)
+{
+ mutex_lock(&event_mutex);
+ module_put(file->event_call->mod);
+ mutex_unlock(&event_mutex);
+
+ trace_array_put(file->tr);
+}
+EXPORT_SYMBOL_GPL(trace_put_event_file);
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* Avoid typos */
@@ -3032,7 +3168,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- tracefs_remove_recursive(tr->event_dir);
+ tracefs_remove(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -3359,8 +3495,8 @@ static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
+ struct trace_buffer *buffer;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
struct ftrace_entry *entry;
unsigned long flags;
long disabled;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c773b8fb270c..bf44f6bbd0c3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
switch (*next) {
case '(': /* #2 */
- if (top - op_stack > nr_parens)
- return ERR_PTR(-EINVAL);
+ if (top - op_stack > nr_parens) {
+ ret = -EINVAL;
+ goto out_free;
+ }
*(++top) = invert;
continue;
case '!': /* #3 */
@@ -1660,7 +1662,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
- kfree(filter);
+ __free_filter(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
tracepoint_synchronize_unregister();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ca6b0dff60c5..483b3fd1094f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -7,18 +7,23 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
+#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/rculist.h>
#include <linux/tracefs.h>
+/* for gfp flag names */
+#include <linux/trace_events.h>
+#include <trace/events/mmflags.h>
+
#include "tracing_map.h"
#include "trace.h"
#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 16
+#define SYNTH_FIELDS_MAX 32
#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
@@ -61,7 +66,12 @@
C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
C(VAR_NOT_FOUND, "Couldn't find variable"), \
- C(FIELD_NOT_FOUND, "Couldn't find field"),
+ C(FIELD_NOT_FOUND, "Couldn't find field"), \
+ C(EMPTY_ASSIGNMENT, "Empty assignment"), \
+ C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
+ C(EMPTY_SORT_FIELD, "Empty sort field"), \
+ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -111,6 +121,7 @@ struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
+ unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
@@ -369,15 +380,15 @@ struct hist_trigger_data {
unsigned int n_save_var_str;
};
-static int synth_event_create(int argc, const char **argv);
+static int create_synth_event(int argc, const char **argv);
static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
static bool synth_event_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations synth_event_ops = {
- .create = synth_event_create,
+ .create = create_synth_event,
.show = synth_event_show,
.is_busy = synth_event_is_busy,
.free = synth_event_release,
@@ -388,6 +399,7 @@ struct synth_field {
char *type;
char *name;
size_t size;
+ unsigned int offset;
bool is_signed;
bool is_string;
};
@@ -402,6 +414,7 @@ struct synth_event {
struct trace_event_class class;
struct trace_event_call call;
struct tracepoint *tp;
+ struct module *mod;
};
static bool is_synth_event(struct dyn_event *ev)
@@ -422,7 +435,7 @@ static bool synth_event_is_busy(struct dyn_event *ev)
}
static bool synth_event_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct synth_event *sev = to_synth_event(ev);
@@ -464,11 +477,12 @@ struct action_data {
* When a histogram trigger is hit, the values of any
* references to variables, including variables being passed
* as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
+ * var_ref_vals array. This var_ref_idx array is an array of
+ * indices into the var_ref_vals array, one for each synthetic
+ * event param, and is passed to the synthetic event
+ * invocation.
*/
- unsigned int var_ref_idx;
+ unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -602,7 +616,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ strcpy(last_cmd, "hist:");
+ strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
if (file) {
call = file->event_call;
@@ -656,6 +671,8 @@ static int synth_event_define_fields(struct trace_event_call *call)
if (ret)
break;
+ event->fields[i]->offset = n_u64;
+
if (event->fields[i]->is_string) {
offset += STR_VAR_LEN_MAX;
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
@@ -674,6 +691,8 @@ static bool synth_field_signed(char *type)
{
if (str_has_prefix(type, "u"))
return false;
+ if (strcmp(type, "gfp_t") == 0)
+ return false;
return true;
}
@@ -752,6 +771,8 @@ static int synth_field_size(char *type)
size = sizeof(unsigned long);
else if (strcmp(type, "pid_t") == 0)
size = sizeof(pid_t);
+ else if (strcmp(type, "gfp_t") == 0)
+ size = sizeof(gfp_t);
else if (synth_field_is_string(type))
size = synth_field_string_size(type);
@@ -792,6 +813,8 @@ static const char *synth_field_fmt(char *type)
fmt = "%lu";
else if (strcmp(type, "pid_t") == 0)
fmt = "%d";
+ else if (strcmp(type, "gfp_t") == 0)
+ fmt = "%x";
else if (synth_field_is_string(type))
fmt = "%s";
@@ -822,7 +845,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
fmt = synth_field_fmt(se->fields[i]->type);
/* parameter types */
- if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", fmt);
snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
@@ -834,9 +857,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
+ struct trace_print_flags __flags[] = {
+ __def_gfpflag_names, {-1, NULL} };
+
trace_seq_printf(s, print_fmt, se->fields[i]->name,
entry->fields[n_u64],
i == se->n_fields - 1 ? "" : " ");
+
+ if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
+ trace_seq_puts(s, " (");
+ trace_print_flags_seq(s, "|",
+ entry->fields[n_u64],
+ __flags);
+ trace_seq_putc(s, ')');
+ }
n_u64++;
}
}
@@ -852,14 +886,14 @@ static struct trace_event_functions synth_event_funcs = {
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct trace_event_file *trace_file = __data;
struct synth_trace_event *entry;
struct trace_event_buffer fbuffer;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
struct synth_event *event;
- unsigned int i, n_u64;
+ unsigned int i, n_u64, val_idx;
int fields_size = 0;
event = trace_file->event_call->data;
@@ -873,7 +907,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
- buffer = trace_file->tr->trace_buffer.buffer;
+ buffer = trace_file->tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
@@ -882,14 +916,34 @@ static notrace void trace_event_raw_event_synth(void *__data,
goto out;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
char *str_field = (char *)&entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
- entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[val_idx];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
n_u64++;
}
}
@@ -1071,10 +1125,10 @@ static struct tracepoint *alloc_synth_tracepoint(char *name)
}
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
- unsigned int var_ref_idx);
+ unsigned int *var_ref_idx);
static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct tracepoint *tp = event->tp;
@@ -1113,6 +1167,12 @@ static struct synth_event *find_synth_event(const char *name)
return NULL;
}
+static struct trace_event_fields synth_event_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = synth_event_define_fields },
+ {}
+};
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -1134,7 +1194,7 @@ static int register_synth_event(struct synth_event *event)
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &synth_event_funcs;
- call->class->define_fields = synth_event_define_fields;
+ call->class->fields_array = synth_event_fields_array;
ret = register_trace_event(&call->event);
if (!ret) {
@@ -1245,6 +1305,273 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
+static int synth_event_check_arg_fn(void *data)
+{
+ struct dynevent_arg_pair *arg_pair = data;
+ int size;
+
+ size = synth_field_size((char *)arg_pair->lhs);
+
+ return size ? 0 : -EINVAL;
+}
+
+/**
+ * synth_event_add_field - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type: The type of the new field to add
+ * @name: The name of the new field to add
+ *
+ * Add a new field to a synthetic event cmd object. Field ordering is in
+ * the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
+ const char *name)
+{
+ struct dynevent_arg_pair arg_pair;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type || !name)
+ return -EINVAL;
+
+ dynevent_arg_pair_init(&arg_pair, 0, ';');
+
+ arg_pair.lhs = type;
+ arg_pair.rhs = name;
+
+ ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field);
+
+/**
+ * synth_event_add_field_str - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type_name: The type and name of the new field to add, as a single string
+ *
+ * Add a new field to a synthetic event cmd object, as a single
+ * string. The @type_name string is expected to be of the form 'type
+ * name', which will be appended by ';'. No sanity checking is done -
+ * what's passed in is assumed to already be well-formed. Field
+ * ordering is in the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
+{
+ struct dynevent_arg arg;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type_name)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, ';');
+
+ arg.str = type_name;
+
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field_str);
+
+/**
+ * synth_event_add_fields - Add multiple fields to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Add a new set of fields to a synthetic event cmd object. The event
+ * fields that will be defined for the event should be passed in as an
+ * array of struct synth_field_desc, and the number of elements in the
+ * array passed in as n_fields. Field ordering will retain the
+ * ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_fields(struct dynevent_cmd *cmd,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_fields);
+
+/**
+ * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the synth_event_gen_cmd_start() wrapper, which
+ * automatically adds a NULL to the end of the arg list. If this
+ * function is used directly, make sure the last arg in the variable
+ * arg list is NULL.
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * There should be an even number variable args, each pair consisting
+ * of a type followed by a field name.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod, ...)
+{
+ struct dynevent_arg arg;
+ va_list args;
+ int ret;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ va_start(args, mod);
+ for (;;) {
+ const char *type, *name;
+
+ type = va_arg(args, const char *);
+ if (!type)
+ break;
+ name = va_arg(args, const char *);
+ if (!name)
+ break;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, type, name);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
+
+/**
+ * synth_event_gen_cmd_array_start - Start synthetic event command from an array
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * The event fields that will be defined for the event should be
+ * passed in as an array of struct synth_field_desc, and the number of
+ * elements in the array passed in as n_fields. Field ordering will
+ * retain the ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ struct dynevent_arg arg;
+ unsigned int i;
+ int ret = 0;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (n_fields > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL)
+ return -EINVAL;
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
+
static int __create_synth_event(int argc, const char *name, const char **argv)
{
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
@@ -1313,29 +1640,123 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
goto out;
}
+/**
+ * synth_event_create - Create a new synthetic event
+ * @name: The name of the new sythetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ * @mod: The module creating the event, NULL if not created from a module
+ *
+ * Create a new synthetic event with the given name under the
+ * trace/events/synthetic/ directory. The event fields that will be
+ * defined for the event should be passed in as an array of struct
+ * synth_field_desc, and the number elements in the array passed in as
+ * n_fields. Field ordering will retain the ordering given in the
+ * fields array.
+ *
+ * If the new synthetic event is being created from a module, the mod
+ * param must be non-NULL. This will ensure that the trace buffer
+ * won't contain unreadable events.
+ *
+ * The new synth event should be deleted using synth_event_delete()
+ * function. The new synthetic event can be generated from modules or
+ * other kernel code using trace_synth_event() and related functions.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_create(const char *name, struct synth_field_desc *fields,
+ unsigned int n_fields, struct module *mod)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
+ fields, n_fields);
+ if (ret)
+ goto out;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ out:
+ kfree(buf);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_create);
+
+static int destroy_synth_event(struct synth_event *se)
+{
+ int ret;
+
+ if (se->ref)
+ ret = -EBUSY;
+ else {
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * synth_event_delete - Delete a synthetic event
+ * @event_name: The name of the new sythetic event
+ *
+ * Delete a synthetic event that was created with synth_event_create().
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_delete(const char *event_name)
+{
+ struct synth_event *se = NULL;
+ struct module *mod = NULL;
+ int ret = -ENOENT;
+
+ mutex_lock(&event_mutex);
+ se = find_synth_event(event_name);
+ if (se) {
+ mod = se->mod;
+ ret = destroy_synth_event(se);
+ }
+ mutex_unlock(&event_mutex);
+
+ if (mod) {
+ mutex_lock(&trace_types_lock);
+ /*
+ * It is safest to reset the ring buffer if the module
+ * being unloaded registered any events that were
+ * used. The only worry is if a new module gets
+ * loaded, and takes on the same id as the events of
+ * this module. When printing out the buffer, traced
+ * events left over from this module may be passed to
+ * the new module events and unexpected results may
+ * occur.
+ */
+ tracing_reset_all_online_cpus();
+ mutex_unlock(&trace_types_lock);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_delete);
+
static int create_or_delete_synth_event(int argc, char **argv)
{
const char *name = argv[0];
- struct synth_event *event = NULL;
int ret;
/* trace_run_command() ensures argc != 0 */
if (name[0] == '!') {
- mutex_lock(&event_mutex);
- event = find_synth_event(name + 1);
- if (event) {
- if (event->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(event);
- if (!ret) {
- dyn_event_remove(&event->devent);
- free_synth_event(event);
- }
- }
- } else
- ret = -ENOENT;
- mutex_unlock(&event_mutex);
+ ret = synth_event_delete(name + 1);
return ret;
}
@@ -1343,7 +1764,419 @@ static int create_or_delete_synth_event(int argc, char **argv)
return ret == -ECANCELED ? -EINVAL : ret;
}
-static int synth_event_create(int argc, const char **argv)
+static int synth_event_run_command(struct dynevent_cmd *cmd)
+{
+ struct synth_event *se;
+ int ret;
+
+ ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ if (ret)
+ return ret;
+
+ se = find_synth_event(cmd->event_name);
+ if (WARN_ON(!se))
+ return -ENOENT;
+
+ se->mod = cmd->private_data;
+
+ return ret;
+}
+
+/**
+ * synth_event_cmd_init - Initialize a synthetic event command object
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @buf: A pointer to the buffer used to build the command
+ * @maxlen: The length of the buffer passed in @buf
+ *
+ * Initialize a synthetic event command object. Use this before
+ * calling any of the other dyenvent_cmd functions.
+ */
+void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
+{
+ dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
+ synth_event_run_command);
+}
+EXPORT_SYMBOL_GPL(synth_event_cmd_init);
+
+static inline int
+__synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int entry_size, fields_size = 0;
+ int ret = 0;
+
+ /*
+ * Normal event tracing doesn't get called at all unless the
+ * ENABLED bit is set (which attaches the probe thus allowing
+ * this code to be called, etc). Because this is called
+ * directly by the user, we don't have that but we still need
+ * to honor not logging when disabled. For the the iterated
+ * trace case, we save the enabed state upon start and just
+ * ignore the following data calls.
+ */
+ if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file)) {
+ trace_state->disabled = true;
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trace_state->event = file->event_call->data;
+
+ fields_size = trace_state->event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ trace_state->buffer = file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(trace_state->buffer);
+
+ entry_size = sizeof(*trace_state->entry) + fields_size;
+ trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
+ file,
+ entry_size);
+ if (!trace_state->entry) {
+ ring_buffer_nest_end(trace_state->buffer);
+ ret = -EINVAL;
+ }
+out:
+ return ret;
+}
+
+static inline void
+__synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ trace_event_buffer_commit(&trace_state->fbuffer);
+
+ ring_buffer_nest_end(trace_state->buffer);
+}
+
+/**
+ * synth_event_trace - Trace a synthetic event
+ * @file: The trace_event_file representing the synthetic event
+ * @n_vals: The number of values in vals
+ * @args: Variable number of args containing the event values
+ *
+ * Trace a synthetic event using the values passed in the variable
+ * argument list.
+ *
+ * The argument list should be a list 'n_vals' u64 values. The number
+ * of vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ va_list args;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ va_start(args, n_vals);
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ u64 val;
+
+ val = va_arg(args, u64);
+
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ state.entry->fields[n_u64] = val;
+ n_u64++;
+ }
+ }
+ va_end(args);
+
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace);
+
+/**
+ * synth_event_trace_array - Trace a synthetic event from an array
+ * @file: The trace_event_file representing the synthetic event
+ * @vals: Array of values
+ * @n_vals: The number of values in vals
+ *
+ * Trace a synthetic event using the values passed in as 'vals'.
+ *
+ * The 'vals' array is just an array of 'n_vals' u64. The number of
+ * vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
+ unsigned int n_vals)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)vals[i];
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ state.entry->fields[n_u64] = vals[i];
+ n_u64++;
+ }
+ }
+
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_array);
+
+/**
+ * synth_event_trace_start - Start piecewise synthetic event trace
+ * @file: The trace_event_file representing the synthetic event
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Start the trace of a synthetic event field-by-field rather than all
+ * at once.
+ *
+ * This function 'opens' an event trace, which means space is reserved
+ * for the event in the trace buffer, after which the event's
+ * individual field values can be set through either
+ * synth_event_add_next_val() or synth_event_add_val().
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state until the event trace is
+ * closed (and the event finally traced) using
+ * synth_event_trace_end().
+ *
+ * Note that synth_event_trace_end() must be called after all values
+ * have been added for each event trace, regardless of whether adding
+ * all field values succeeded or not.
+ *
+ * Note also that for a given event trace, all fields must be added
+ * using either synth_event_add_next_val() or synth_event_add_val()
+ * but not both together or interleaved.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int ret;
+
+ if (!trace_state)
+ return -EINVAL;
+
+ memset(trace_state, '\0', sizeof(*trace_state));
+
+ ret = __synth_event_trace_start(file, trace_state);
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_start);
+
+static int __synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ struct synth_field *field = NULL;
+ struct synth_trace_event *entry;
+ struct synth_event *event;
+ int i, ret = 0;
+
+ if (!trace_state) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* can't mix add_next_synth_val() with add_synth_val() */
+ if (field_name) {
+ if (trace_state->add_next) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_name = true;
+ } else {
+ if (trace_state->add_name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_next = true;
+ }
+
+ if (trace_state->disabled)
+ goto out;
+
+ event = trace_state->event;
+ if (trace_state->add_name) {
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+ if (strcmp(field->name, field_name) == 0)
+ break;
+ }
+ if (!field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ if (trace_state->cur_field >= event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+ field = event->fields[trace_state->cur_field++];
+ }
+
+ entry = trace_state->entry;
+ if (field->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field;
+
+ if (!str_val) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ str_field = (char *)&entry->fields[field->offset];
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ } else
+ entry->fields[field->offset] = val;
+ out:
+ return ret;
+}
+
+/**
+ * synth_event_add_next_val - Add the next field's value to an open synth trace
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the next field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function assumes all the fields in an event are to be set one
+ * after another - successive calls to this function are made, one for
+ * each field, in the order of the fields in the event, until all
+ * fields have been set. If you'd rather set each field individually
+ * without regard to ordering, synth_event_add_val() can be used
+ * instead.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_next_val(u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(NULL, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_next_val);
+
+/**
+ * synth_event_add_val - Add a named field's value to an open synth trace
+ * @field_name: The name of the synthetic event field value to set
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the named field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function looks up the field name, and if found, sets the field
+ * to the specified value. This lookup makes this function more
+ * expensive than synth_event_add_next_val(), so use that or the
+ * none-piecewise synth_event_trace() instead if efficiency is more
+ * important.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(field_name, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_val);
+
+/**
+ * synth_event_trace_end - End piecewise synthetic event trace
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * End the trace of a synthetic event opened by
+ * synth_event_trace__start().
+ *
+ * This function 'closes' an event trace, which basically means that
+ * it commits the reserved event and cleans up other loose ends.
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state opened with
+ * synth_event_trace_start().
+ *
+ * Note that this function must be called after all values have been
+ * added for each event trace, regardless of whether adding all field
+ * values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ if (!trace_state)
+ return -EINVAL;
+
+ __synth_event_trace_end(trace_state);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_end);
+
+static int create_synth_event(int argc, const char **argv)
{
const char *name = argv[0];
int len;
@@ -1429,6 +2262,10 @@ static int synth_events_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
ret = dyn_events_release_all(&synth_event_ops);
if (ret < 0)
@@ -1661,7 +2498,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data)
if (var_data)
return 0;
- if (trace_array_get(tr) < 0)
+ if (tracing_check_open_get_tr(tr))
return -ENODEV;
var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
@@ -1721,11 +2558,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data,
struct event_trigger_data *test;
struct hist_field *hist_field;
+ lockdep_assert_held(&event_mutex);
+
hist_field = find_var_field(hist_data, var_name);
if (hist_field)
return hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -1775,7 +2614,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file,
struct event_trigger_data *test;
struct hist_field *hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -1985,12 +2826,6 @@ static int parse_map_size(char *str)
unsigned long size, map_bits;
int ret;
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
-
ret = kstrtoul(str, 0, &size);
if (ret)
goto out;
@@ -2050,25 +2885,25 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
static int parse_assignment(struct trace_array *tr,
char *str, struct hist_trigger_attrs *attrs)
{
- int ret = 0;
+ int len, ret = 0;
- if ((str_has_prefix(str, "key=")) ||
- (str_has_prefix(str, "keys="))) {
- attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ if ((len = str_has_prefix(str, "key=")) ||
+ (len = str_has_prefix(str, "keys="))) {
+ attrs->keys_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->keys_str) {
ret = -ENOMEM;
goto out;
}
- } else if ((str_has_prefix(str, "val=")) ||
- (str_has_prefix(str, "vals=")) ||
- (str_has_prefix(str, "values="))) {
- attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "val=")) ||
+ (len = str_has_prefix(str, "vals=")) ||
+ (len = str_has_prefix(str, "values="))) {
+ attrs->vals_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->vals_str) {
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "sort=")) {
- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "sort="))) {
+ attrs->sort_key_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->sort_key_str) {
ret = -ENOMEM;
goto out;
@@ -2079,12 +2914,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "clock=")) {
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
+ } else if ((len = str_has_prefix(str, "clock="))) {
+ str += len;
str = strstrip(str);
attrs->clock = kstrdup(str, GFP_KERNEL);
@@ -2092,8 +2923,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "size=")) {
- int map_bits = parse_map_size(str);
+ } else if ((len = str_has_prefix(str, "size="))) {
+ int map_bits = parse_map_size(str + len);
if (map_bits < 0) {
ret = map_bits;
@@ -2133,8 +2964,15 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
+ char *rhs;
- if (strchr(str, '=')) {
+ rhs = strchr(str, '=');
+ if (rhs) {
+ if (!strlen(++rhs)) {
+ ret = -EINVAL;
+ hist_err(tr, HIST_ERR_EMPTY_ASSIGNMENT, errpos(str));
+ goto free;
+ }
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
@@ -2378,8 +3216,16 @@ static int contains_operator(char *str)
return field_op;
}
+static void get_hist_field(struct hist_field *hist_field)
+{
+ hist_field->ref++;
+}
+
static void __destroy_hist_field(struct hist_field *hist_field)
{
+ if (--hist_field->ref > 1)
+ return;
+
kfree(hist_field->var.name);
kfree(hist_field->name);
kfree(hist_field->type);
@@ -2421,6 +3267,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field)
return NULL;
+ hist_field->ref = 1;
+
hist_field->hist_data = hist_data;
if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
@@ -2595,6 +3443,22 @@ static int init_var_ref(struct hist_field *ref_field,
goto out;
}
+static int find_var_ref_idx(struct hist_trigger_data *hist_data,
+ struct hist_field *var_field)
+{
+ struct hist_field *ref_field;
+ int i;
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data)
+ return i;
+ }
+
+ return -ENOENT;
+}
+
/**
* create_var_ref - Create a variable reference and attach it to trigger
* @hist_data: The trigger that will be referencing the variable
@@ -2616,6 +3480,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
struct hist_field *ref_field;
+ int i;
+
+ /* Check if the variable already exists */
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data) {
+ get_hist_field(ref_field);
+ return ref_field;
+ }
+ }
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
@@ -2785,6 +3660,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
return NULL;
}
+ alias->var_ref_idx = var_ref->var_ref_idx;
+
return alias;
}
@@ -3068,7 +3945,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
{
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (test->private_data == hist_data)
return test->filter_str;
@@ -3119,9 +3998,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data,
struct event_trigger_data *test;
unsigned int n_keys;
+ lockdep_assert_held(&event_mutex);
+
n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
@@ -4063,8 +4944,11 @@ static int check_synth_field(struct synth_event *event,
field = event->fields[field_pos];
- if (strcmp(field->type, hist_field->type) != 0)
- return -EINVAL;
+ if (strcmp(field->type, hist_field->type) != 0) {
+ if (field->size != hist_field->size ||
+ field->is_signed != hist_field->is_signed)
+ return -EINVAL;
+ }
return 0;
}
@@ -4151,11 +5035,11 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
struct trace_array *tr = hist_data->event_file->tr;
char *event_name, *param, *system = NULL;
struct hist_field *hist_field, *var_ref;
- unsigned int i, var_ref_idx;
+ unsigned int i;
unsigned int field_pos = 0;
struct synth_event *event;
char *synth_event_name;
- int ret = 0;
+ int var_ref_idx, ret = 0;
lockdep_assert_held(&event_mutex);
@@ -4172,8 +5056,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
event->ref++;
- var_ref_idx = hist_data->n_var_refs;
-
for (i = 0; i < data->n_params; i++) {
char *p;
@@ -4222,6 +5104,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
goto err;
}
+ var_ref_idx = find_var_ref_idx(hist_data, var_ref);
+ if (WARN_ON(var_ref_idx < 0)) {
+ ret = var_ref_idx;
+ goto err;
+ }
+
+ data->var_ref_idx[i] = var_ref_idx;
+
field_pos++;
kfree(p);
continue;
@@ -4240,7 +5130,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
}
data->synth_event = event;
- data->var_ref_idx = var_ref_idx;
out:
return ret;
err:
@@ -4459,10 +5348,6 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
j < TRACING_MAP_VALS_MAX; i++) {
field_str = strsep(&fields_str, ",");
@@ -4557,10 +5442,6 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
field_str = strsep(&fields_str, ",");
if (!field_str)
@@ -4692,7 +5573,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
return ret;
}
-static int is_descending(const char *str)
+static int is_descending(struct trace_array *tr, const char *str)
{
if (!str)
return 0;
@@ -4703,11 +5584,14 @@ static int is_descending(const char *str)
if (strcmp(str, "ascending") == 0)
return 0;
+ hist_err(tr, HIST_ERR_INVALID_SORT_MODIFIER, errpos((char *)str));
+
return -EINVAL;
}
static int create_sort_keys(struct hist_trigger_data *hist_data)
{
+ struct trace_array *tr = hist_data->event_file->tr;
char *fields_str = hist_data->attrs->sort_key_str;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
@@ -4718,12 +5602,6 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str) {
- ret = -EINVAL;
- goto out;
- }
-
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
struct hist_field *hist_field;
char *field_str, *field_name;
@@ -4732,25 +5610,30 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
sort_key = &hist_data->sort_keys[i];
field_str = strsep(&fields_str, ",");
- if (!field_str) {
- if (i == 0)
- ret = -EINVAL;
+ if (!field_str)
+ break;
+
+ if (!*field_str) {
+ ret = -EINVAL;
+ hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort="));
break;
}
if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) {
+ hist_err(tr, HIST_ERR_TOO_MANY_SORT_FIELDS, errpos("sort="));
ret = -EINVAL;
break;
}
field_name = strsep(&field_str, ".");
- if (!field_name) {
+ if (!field_name || !*field_name) {
ret = -EINVAL;
+ hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort="));
break;
}
if (strcmp(field_name, "hitcount") == 0) {
- descending = is_descending(field_str);
+ descending = is_descending(tr, field_str);
if (descending < 0) {
ret = descending;
break;
@@ -4772,7 +5655,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
if (strcmp(field_name, test_name) == 0) {
sort_key->field_idx = idx;
- descending = is_descending(field_str);
+ descending = is_descending(tr, field_str);
if (descending < 0) {
ret = descending;
goto out;
@@ -4783,6 +5666,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
if (j == hist_data->n_fields) {
ret = -EINVAL;
+ hist_err(tr, HIST_ERR_INVALID_SORT_FIELD, errpos(field_name));
break;
}
}
@@ -5481,7 +6365,7 @@ static int hist_show(struct seq_file *m, void *v)
goto out_unlock;
}
- list_for_each_entry_rcu(data, &event_file->triggers, list) {
+ list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
@@ -5494,6 +6378,12 @@ static int hist_show(struct seq_file *m, void *v)
static int event_hist_open(struct inode *inode, struct file *file)
{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
return single_open(file, hist_show, file);
}
@@ -5868,7 +6758,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (hist_data->attrs->name && !named_data)
goto new;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -5952,10 +6844,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data,
struct event_trigger_data *test, *named_data = NULL;
bool match = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (hist_trigger_match(data, test, named_data, false)) {
match = true;
@@ -5973,10 +6867,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data,
struct hist_trigger_data *hist_data = data->private_data;
struct event_trigger_data *test, *named_data = NULL;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -5998,10 +6894,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test, *named_data = NULL;
bool unregistered = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -6027,7 +6925,9 @@ static bool hist_file_check_refs(struct trace_event_file *file)
struct hist_trigger_data *hist_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
if (check_var_refs(hist_data))
@@ -6270,7 +7170,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+ list_for_each_entry_rcu(test, &enable_data->file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (enable_data->enable)
test->paused = false;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
new file mode 100644
index 000000000000..22bcf7c51d1e
--- /dev/null
+++ b/kernel/trace/trace_events_inject.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * trace_events_inject - trace event injection
+ *
+ * Copyright (C) 2019 Cong Wang <cwang@twitter.com>
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+
+#include "trace.h"
+
+static int
+trace_inject_entry(struct trace_event_file *file, void *rec, int len)
+{
+ struct trace_event_buffer fbuffer;
+ int written = 0;
+ void *entry;
+
+ rcu_read_lock_sched();
+ entry = trace_event_buffer_reserve(&fbuffer, file, len);
+ if (entry) {
+ memcpy(entry, rec, len);
+ written = len;
+ trace_event_buffer_commit(&fbuffer);
+ }
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static int
+parse_field(char *str, struct trace_event_call *call,
+ struct ftrace_event_field **pf, u64 *pv)
+{
+ struct ftrace_event_field *field;
+ char *field_name;
+ int s, i = 0;
+ int len;
+ u64 val;
+
+ if (!str[i])
+ return 0;
+ /* First find the field to associate to */
+ while (isspace(str[i]))
+ i++;
+ s = i;
+ while (isalnum(str[i]) || str[i] == '_')
+ i++;
+ len = i - s;
+ if (!len)
+ return -EINVAL;
+
+ field_name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!field_name)
+ return -ENOMEM;
+ field = trace_find_event_field(call, field_name);
+ kfree(field_name);
+ if (!field)
+ return -ENOENT;
+
+ *pf = field;
+ while (isspace(str[i]))
+ i++;
+ if (str[i] != '=')
+ return -EINVAL;
+ i++;
+ while (isspace(str[i]))
+ i++;
+ s = i;
+ if (isdigit(str[i]) || str[i] == '-') {
+ char *num, c;
+ int ret;
+
+ /* Make sure the field is not a string */
+ if (is_string_field(field))
+ return -EINVAL;
+
+ if (str[i] == '-')
+ i++;
+
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
+ num = str + s;
+ c = str[i];
+ if (c != '\0' && !isspace(c))
+ return -EINVAL;
+ str[i] = '\0';
+ /* Make sure it is a value */
+ if (field->is_signed)
+ ret = kstrtoll(num, 0, &val);
+ else
+ ret = kstrtoull(num, 0, &val);
+ str[i] = c;
+ if (ret)
+ return ret;
+
+ *pv = val;
+ return i;
+ } else if (str[i] == '\'' || str[i] == '"') {
+ char q = str[i];
+
+ /* Make sure the field is OK for strings */
+ if (!is_string_field(field))
+ return -EINVAL;
+
+ for (i++; str[i]; i++) {
+ if (str[i] == '\\' && str[i + 1]) {
+ i++;
+ continue;
+ }
+ if (str[i] == q)
+ break;
+ }
+ if (!str[i])
+ return -EINVAL;
+
+ /* Skip quotes */
+ s++;
+ len = i - s;
+ if (len >= MAX_FILTER_STR_VAL)
+ return -EINVAL;
+
+ *pv = (unsigned long)(str + s);
+ str[i] = 0;
+ /* go past the last quote */
+ i++;
+ return i;
+ }
+
+ return -EINVAL;
+}
+
+static int trace_get_entry_size(struct trace_event_call *call)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ int size = 0;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ if (field->size + field->offset > size)
+ size = field->size + field->offset;
+ }
+
+ return size;
+}
+
+static void *trace_alloc_entry(struct trace_event_call *call, int *size)
+{
+ int entry_size = trace_get_entry_size(call);
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ void *entry = NULL;
+
+ /* We need an extra '\0' at the end. */
+ entry = kzalloc(entry_size + 1, GFP_KERNEL);
+ if (!entry)
+ return NULL;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ if (!is_string_field(field))
+ continue;
+ if (field->filter_type == FILTER_STATIC_STRING)
+ continue;
+ if (field->filter_type == FILTER_DYN_STRING) {
+ u32 *str_item;
+ int str_loc = entry_size & 0xffff;
+
+ str_item = (u32 *)(entry + field->offset);
+ *str_item = str_loc; /* string length is 0. */
+ } else {
+ char **paddr;
+
+ paddr = (char **)(entry + field->offset);
+ *paddr = "";
+ }
+ }
+
+ *size = entry_size + 1;
+ return entry;
+}
+
+#define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED"
+
+/* Caller is responsible to free the *pentry. */
+static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
+{
+ struct ftrace_event_field *field;
+ unsigned long irq_flags;
+ void *entry = NULL;
+ int entry_size;
+ u64 val = 0;
+ int len;
+
+ entry = trace_alloc_entry(call, &entry_size);
+ *pentry = entry;
+ if (!entry)
+ return -ENOMEM;
+
+ local_save_flags(irq_flags);
+ tracing_generic_entry_update(entry, call->event.type, irq_flags,
+ preempt_count());
+
+ while ((len = parse_field(str, call, &field, &val)) > 0) {
+ if (is_function_field(field))
+ return -EINVAL;
+
+ if (is_string_field(field)) {
+ char *addr = (char *)(unsigned long) val;
+
+ if (field->filter_type == FILTER_STATIC_STRING) {
+ strlcpy(entry + field->offset, addr, field->size);
+ } else if (field->filter_type == FILTER_DYN_STRING) {
+ int str_len = strlen(addr) + 1;
+ int str_loc = entry_size & 0xffff;
+ u32 *str_item;
+
+ entry_size += str_len;
+ *pentry = krealloc(entry, entry_size, GFP_KERNEL);
+ if (!*pentry) {
+ kfree(entry);
+ return -ENOMEM;
+ }
+ entry = *pentry;
+
+ strlcpy(entry + (entry_size - str_len), addr, str_len);
+ str_item = (u32 *)(entry + field->offset);
+ *str_item = (str_len << 16) | str_loc;
+ } else {
+ char **paddr;
+
+ paddr = (char **)(entry + field->offset);
+ *paddr = INJECT_STRING;
+ }
+ } else {
+ switch (field->size) {
+ case 1: {
+ u8 tmp = (u8) val;
+
+ memcpy(entry + field->offset, &tmp, 1);
+ break;
+ }
+ case 2: {
+ u16 tmp = (u16) val;
+
+ memcpy(entry + field->offset, &tmp, 2);
+ break;
+ }
+ case 4: {
+ u32 tmp = (u32) val;
+
+ memcpy(entry + field->offset, &tmp, 4);
+ break;
+ }
+ case 8:
+ memcpy(entry + field->offset, &val, 8);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ str += len;
+ }
+
+ if (len < 0)
+ return len;
+
+ return entry_size;
+}
+
+static ssize_t
+event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_event_call *call;
+ struct trace_event_file *file;
+ int err = -ENODEV, size;
+ void *entry = NULL;
+ char *buf;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ strim(buf);
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file) {
+ call = file->event_call;
+ size = parse_entry(buf, call, &entry);
+ if (size < 0)
+ err = size;
+ else
+ err = trace_inject_entry(file, entry, size);
+ }
+ mutex_unlock(&event_mutex);
+
+ kfree(entry);
+ kfree(buf);
+
+ if (err < 0)
+ return err;
+
+ *ppos += err;
+ return cnt;
+}
+
+static ssize_t
+event_inject_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ return -EPERM;
+}
+
+const struct file_operations event_inject_fops = {
+ .open = tracing_open_generic,
+ .read = event_inject_read,
+ .write = event_inject_write,
+};
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2a2912cb4533..dd34a1b46a86 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -5,6 +5,7 @@
* Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
*/
+#include <linux/security.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -115,9 +116,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
struct trace_event_file *event_file = event_file_data(m->private);
- if (t == SHOW_AVAILABLE_TRIGGERS)
+ if (t == SHOW_AVAILABLE_TRIGGERS) {
+ (*pos)++;
return NULL;
-
+ }
return seq_list_next(t, &event_file->triggers, pos);
}
@@ -173,7 +175,11 @@ static const struct seq_operations event_triggers_seq_ops = {
static int event_trigger_regex_open(struct inode *inode, struct file *file)
{
- int ret = 0;
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
mutex_lock(&event_mutex);
@@ -208,7 +214,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
return ret;
}
-static int trigger_process_regex(struct trace_event_file *file, char *buff)
+int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next = buff;
struct event_command *p;
@@ -292,6 +298,7 @@ event_trigger_write(struct file *filp, const char __user *ubuf,
static int
event_trigger_open(struct inode *inode, struct file *filp)
{
+ /* Checks for tracefs lockdown */
return event_trigger_regex_open(inode, filp);
}
@@ -495,7 +502,9 @@ void update_cond_flag(struct trace_event_file *file)
struct event_trigger_data *data;
bool set_cond = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->filter || event_command_post_trigger(data->cmd_ops) ||
event_command_needs_rec(data->cmd_ops)) {
set_cond = true;
@@ -530,7 +539,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
ret = -EEXIST;
goto out;
@@ -575,7 +586,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
@@ -1491,7 +1504,9 @@ int event_enable_register_trigger(char *glob,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
test_enable_data = test->private_data;
if (test_enable_data &&
(test->cmd_ops->trigger_type ==
@@ -1531,7 +1546,9 @@ void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
enable_data = data->private_data;
if (enable_data &&
(data->cmd_ops->trigger_type ==
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 45630a76ed3a..77ce5a3b6773 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call,
* function and thus become accesible via perf.
*/
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
- filter, regfn) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
/* not needed for this file */
#undef __field_struct
@@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field
#define __field(type, item) type item;
+#undef __field_fn
+#define __field_fn(type, item) type item;
+
#undef __field_desc
#define __field_desc(type, container, item) type item;
@@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call,
#define F_printk(fmt, args...) fmt, args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct ____ftrace_##name { \
tstruct \
}; \
@@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
+#undef __field_ext
+#define __field_ext(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+ .size = sizeof(_type), .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = _filter_type },
+
#undef __field
-#define __field(type, item) \
- ret = trace_define_field(event_call, #type, #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER)
+
+#undef __field_fn
+#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN)
#undef __field_desc
-#define __field_desc(type, container, item) \
- ret = trace_define_field(event_call, #type, #item, \
- offsetof(typeof(field), \
- container.item), \
- sizeof(field.container.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
#undef __array
-#define __array(type, item, len) \
- do { \
- char *type_str = #type"["__stringify(len)"]"; \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, type_str, #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret; \
- } while (0);
+#define __array(_type, _item, _len) { \
+ .type = #_type"["__stringify(_len)"]", .name = #_item, \
+ .size = sizeof(_type[_len]), .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = FILTER_OTHER },
#undef __array_desc
-#define __array_desc(type, container, item, len) \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
- offsetof(typeof(field), \
- container.item), \
- sizeof(field.container.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
#undef __dynamic_array
-#define __dynamic_array(type, item) \
- ret = trace_define_field(event_call, #type "[]", #item, \
- offsetof(typeof(field), item), \
- 0, is_signed_type(type), filter_type);\
- if (ret) \
- return ret;
+#define __dynamic_array(_type, _item) { \
+ .type = #_type "[]", .name = #_item, \
+ .size = 0, .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = FILTER_OTHER },
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
-static int __init \
-ftrace_define_fields_##name(struct trace_event_call *event_call) \
-{ \
- struct struct_name field; \
- int ret; \
- int filter_type = filter; \
- \
- tstruct; \
- \
- return ret; \
-}
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
+static struct trace_event_fields ftrace_event_fields_##name[] = { \
+ tstruct \
+ {} };
#include "trace_entries.h"
@@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#undef __field
#define __field(type, item)
+#undef __field_fn
+#define __field_fn(type, item)
+
#undef __field_desc
#define __field_desc(type, container, item)
@@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
- regfn) \
- \
-struct trace_event_class __refdata event_class_ftrace_##call = { \
+#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \
+static struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
- .define_fields = ftrace_define_fields_##call, \
+ .fields_array = ftrace_event_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
.reg = regfn, \
}; \
@@ -187,13 +159,13 @@ struct trace_event_call __used event_##call = { \
.print_fmt = print, \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
-struct trace_event_call __used \
+static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
FTRACE_ENTRY_REG(call, struct_name, etype, \
- PARAMS(tstruct), PARAMS(print), filter, NULL)
+ PARAMS(tstruct), PARAMS(print), NULL)
bool ftrace_event_is_function(struct trace_event_call *call)
{
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b611cd36e22d..8a4c8d5c2c98 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -101,7 +101,7 @@ static int function_trace_init(struct trace_array *tr)
ftrace_init_array_ops(tr, func);
- tr->trace_buffer.cpu = get_cpu();
+ tr->array_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
@@ -118,7 +118,7 @@ static void function_trace_reset(struct trace_array *tr)
static void function_trace_start(struct trace_array *tr)
{
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
}
static void
@@ -143,7 +143,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
goto out;
cpu = smp_processor_id();
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (!atomic_read(&data->disabled)) {
local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
@@ -192,7 +192,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 78af97163147..7d71546ba00a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -101,7 +101,7 @@ int __trace_graph_entry(struct trace_array *tr,
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -171,7 +171,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -221,7 +221,7 @@ void __trace_graph_return(struct trace_array *tr,
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -252,7 +252,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -444,9 +444,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* We need to consume the current entry to see
* the next one.
*/
- ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
+ ring_buffer_consume(iter->array_buffer->buffer, iter->cpu,
NULL, NULL);
- event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
+ event = ring_buffer_peek(iter->array_buffer->buffer, iter->cpu,
NULL, NULL);
}
@@ -503,7 +503,7 @@ print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s)
{
unsigned long long usecs;
- usecs = iter->ts - iter->trace_buffer->time_start;
+ usecs = iter->ts - iter->array_buffer->time_start;
do_div(usecs, NSEC_PER_USEC);
trace_seq_printf(s, "%9llu us | ", usecs);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index fa95139445b2..a48808c43249 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * trace_hwlatdetect.c - A simple Hardware Latency detector.
+ * trace_hwlat.c - A simple Hardware Latency detector.
*
* Use this tracer to detect large system latencies induced by the behavior of
* certain underlying system hardware or firmware, independent of Linux itself.
@@ -104,7 +104,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
struct trace_event_call *call = &event_hwlat;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
unsigned long flags;
@@ -150,7 +150,7 @@ void trace_hwlat_callback(bool enter)
if (enter)
nmi_ts_start = time_get();
else
- nmi_total_ts = time_get() - nmi_ts_start;
+ nmi_total_ts += time_get() - nmi_ts_start;
}
if (enter)
@@ -237,6 +237,7 @@ static int get_sample(void)
/* If we exceed the threshold value, we have found a hardware latency */
if (sample > thresh || outer_sample > thresh) {
struct hwlat_sample s;
+ u64 latency;
ret = 1;
@@ -253,9 +254,13 @@ static int get_sample(void)
s.nmi_count = nmi_count;
trace_hwlat_sample(&s);
+ latency = max(sample, outer_sample);
+
/* Keep a running maximum ever recorded hardware latency */
- if (sample > tr->max_latency)
- tr->max_latency = sample;
+ if (latency > tr->max_latency) {
+ tr->max_latency = latency;
+ latency_fsnotify(tr);
+ }
}
out:
@@ -274,7 +279,7 @@ static void move_to_next_cpu(void)
return;
/*
* If for some reason the user modifies the CPU affinity
- * of this thread, than stop migrating for the duration
+ * of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, current->cpus_ptr))
@@ -551,7 +556,7 @@ static int init_tracefs(void)
return 0;
err:
- tracefs_remove_recursive(top_dir);
+ tracefs_remove(top_dir);
return -ENOMEM;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a745b0cee5d3..10bbb0f381d5 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -122,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
if (!irqs_disabled_flags(*flags) && !preempt_count())
return 0;
- *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ *data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
@@ -167,7 +167,7 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
per_cpu(tracing_cpu, cpu) = 0;
tr->max_latency = 0;
- tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
+ tracing_reset_online_cpus(&irqsoff_trace->array_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
}
@@ -382,7 +382,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
if (per_cpu(tracing_cpu, cpu))
return;
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (unlikely(!data) || atomic_read(&data->disabled))
return;
@@ -420,7 +420,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
if (!tracer_enabled || !tracing_is_enabled())
return;
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index cca65044c14c..9da76104f7a2 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,7 +43,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ ring_buffer_read_prepare(iter.array_buffer->buffer,
cpu, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
@@ -51,7 +51,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ ring_buffer_read_prepare(iter.array_buffer->buffer,
cpu_file, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
@@ -124,7 +124,7 @@ static int kdb_ftdump(int argc, const char **argv)
iter.buffer_iter = buffer_iter;
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
/* A negative skip_entries means skip all but the last entries */
@@ -139,7 +139,7 @@ static int kdb_ftdump(int argc, const char **argv)
ftrace_dump_buf(skip_entries, cpu_file);
for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
kdb_trap_printk--;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9d483ad9bb6c..362cca52f5de 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_kprobe: " fmt
+#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/rculist.h>
@@ -21,7 +22,6 @@
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
-#define MAX_KPROBE_CMDLINE_SIZE 1024
/* Kprobe early definition from command line */
static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
@@ -39,7 +39,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_kprobe_release(struct dyn_event *ev);
static bool trace_kprobe_is_busy(struct dyn_event *ev);
static bool trace_kprobe_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations trace_kprobe_ops = {
.create = trace_kprobe_create,
@@ -137,13 +137,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev)
return trace_probe_is_enabled(&tk->tp);
}
+static bool trace_kprobe_match_command_head(struct trace_kprobe *tk,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+
+ if (!argc)
+ return true;
+
+ if (!tk->symbol)
+ snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr);
+ else if (tk->rp.kp.offset)
+ snprintf(buf, sizeof(buf), "%s+%u",
+ trace_kprobe_symbol(tk), tk->rp.kp.offset);
+ else
+ snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk));
+ if (strcmp(buf, argv[0]))
+ return false;
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tk->tp, argc, argv);
+}
+
static bool trace_kprobe_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
return strcmp(trace_probe_name(&tk->tp), event) == 0 &&
- (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0);
+ (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) &&
+ trace_kprobe_match_command_head(tk, argc, argv);
}
static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
@@ -180,20 +203,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk)
return addr;
}
+static nokprobe_inline struct trace_kprobe *
+trace_kprobe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return NULL;
+
+ return container_of(tp, struct trace_kprobe, tp);
+}
+
bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+ struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return kprobe_on_func_entry(tk->rp.kp.addr,
+ return tk ? kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false;
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+ struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return within_error_injection_list(trace_kprobe_address(tk));
+ return tk ? within_error_injection_list(trace_kprobe_address(tk)) :
+ false;
}
static int register_kprobe_event(struct trace_kprobe *tk);
@@ -253,7 +289,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group);
+ ret = trace_probe_init(&tk->tp, event, group, false);
if (ret < 0)
goto error;
@@ -291,32 +327,68 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
return ret;
}
+static void __disable_trace_kprobe(struct trace_probe *tp)
+{
+ struct trace_probe *pos;
+ struct trace_kprobe *tk;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tk = container_of(pos, struct trace_kprobe, tp);
+ if (!trace_kprobe_is_registered(tk))
+ continue;
+ if (trace_kprobe_is_return(tk))
+ disable_kretprobe(&tk->rp);
+ else
+ disable_kprobe(&tk->rp.kp);
+ }
+}
+
/*
* Enable trace_probe
* if the file is NULL, enable "perf" handler, or enable "trace" handler.
*/
-static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
+static int enable_trace_kprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- bool enabled = trace_probe_is_enabled(&tk->tp);
+ struct trace_probe *pos, *tp;
+ struct trace_kprobe *tk;
+ bool enabled;
int ret = 0;
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
if (file) {
- ret = trace_probe_add_file(&tk->tp, file);
+ ret = trace_probe_add_file(tp, file);
if (ret)
return ret;
} else
- trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE);
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
if (enabled)
return 0;
- ret = __enable_trace_kprobe(tk);
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tk = container_of(pos, struct trace_kprobe, tp);
+ if (trace_kprobe_has_gone(tk))
+ continue;
+ ret = __enable_trace_kprobe(tk);
+ if (ret)
+ break;
+ enabled = true;
+ }
+
if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled)
+ __disable_trace_kprobe(tp);
if (file)
- trace_probe_remove_file(&tk->tp, file);
+ trace_probe_remove_file(tp, file);
else
- trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE);
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
}
return ret;
@@ -326,11 +398,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
* Disable trace_probe
* if the file is NULL, disable "perf" handler, or disable "trace" handler.
*/
-static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
+static int disable_trace_kprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- struct trace_probe *tp = &tk->tp;
- int ret = 0;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
if (file) {
if (!trace_probe_get_file_link(tp, file))
@@ -341,12 +416,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
} else
trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
- if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) {
- if (trace_kprobe_is_return(tk))
- disable_kretprobe(&tk->rp);
- else
- disable_kprobe(&tk->rp.kp);
- }
+ if (!trace_probe_is_enabled(tp))
+ __disable_trace_kprobe(tp);
out:
if (file)
@@ -358,16 +429,15 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
*/
trace_probe_remove_file(tp, file);
- return ret;
+ return 0;
}
#if defined(CONFIG_KPROBES_ON_FTRACE) && \
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
-static bool within_notrace_func(struct trace_kprobe *tk)
+static bool __within_notrace_func(unsigned long addr)
{
- unsigned long offset, size, addr;
+ unsigned long offset, size;
- addr = trace_kprobe_address(tk);
if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset))
return false;
@@ -380,6 +450,28 @@ static bool within_notrace_func(struct trace_kprobe *tk)
*/
return !ftrace_location_range(addr, addr + size - 1);
}
+
+static bool within_notrace_func(struct trace_kprobe *tk)
+{
+ unsigned long addr = addr = trace_kprobe_address(tk);
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (!__within_notrace_func(addr))
+ return false;
+
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return true;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_notrace_func(addr);
+ }
+
+ return true;
+}
#else
#define within_notrace_func(tk) (false)
#endif
@@ -389,6 +481,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
{
int i, ret;
+ ret = security_locked_down(LOCKDOWN_KPROBES);
+ if (ret)
+ return ret;
+
if (trace_kprobe_is_registered(tk))
return -EINVAL;
@@ -437,6 +533,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk)
/* Unregister a trace_probe and probe_event */
static int unregister_trace_kprobe(struct trace_kprobe *tk)
{
+ /* If other probes are on the event, just unregister kprobe */
+ if (trace_probe_has_sibling(&tk->tp))
+ goto unreg;
+
/* Enabled event can not be unregistered */
if (trace_probe_is_enabled(&tk->tp))
return -EBUSY;
@@ -445,12 +545,82 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
if (unregister_kprobe_event(tk))
return -EBUSY;
+unreg:
__unregister_trace_kprobe(tk);
dyn_event_remove(&tk->devent);
+ trace_probe_unlink(&tk->tp);
return 0;
}
+static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig,
+ struct trace_kprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ struct trace_probe *pos;
+ int i;
+
+ list_for_each_entry(pos, &tpe->probes, list) {
+ orig = container_of(pos, struct trace_kprobe, tp);
+ if (strcmp(trace_kprobe_symbol(orig),
+ trace_kprobe_symbol(comp)) ||
+ trace_kprobe_offset(orig) != trace_kprobe_offset(comp))
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
+{
+ int ret;
+
+ ret = trace_probe_compare_arg_type(&tk->tp, &to->tp);
+ if (ret) {
+ /* Note that argument starts index = 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_kprobe_has_same_kprobe(to, tk)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tk->tp, &to->tp);
+ if (ret)
+ return ret;
+
+ /* Register k*probe */
+ ret = __register_trace_kprobe(tk);
+ if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
+ pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
+ ret = 0;
+ }
+
+ if (ret)
+ trace_probe_unlink(&tk->tp);
+ else
+ dyn_event_add(&tk->devent);
+
+ return ret;
+}
+
/* Register a trace_probe and probe_event */
static int register_trace_kprobe(struct trace_kprobe *tk)
{
@@ -459,14 +629,17 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&event_mutex);
- /* Delete old (same name) event if exist */
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
if (old_tk) {
- ret = unregister_trace_kprobe(old_tk);
- if (ret < 0)
- goto end;
- free_trace_kprobe(old_tk);
+ if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ ret = -EEXIST;
+ } else {
+ ret = append_trace_kprobe(tk, old_tk);
+ }
+ goto end;
}
/* Register new event */
@@ -700,7 +873,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_INSN_BNDRY);
else if (ret == -ENOENT)
trace_probe_log_err(0, BAD_PROBE_ADDR);
- else if (ret != -ENOMEM)
+ else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
goto error;
}
@@ -728,6 +901,167 @@ static int create_or_delete_trace_kprobe(int argc, char **argv)
return ret == -ECANCELED ? -EINVAL : ret;
}
+static int trace_kprobe_run_command(struct dynevent_cmd *cmd)
+{
+ return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe);
+}
+
+/**
+ * kprobe_event_cmd_init - Initialize a kprobe event command object
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @buf: A pointer to the buffer used to build the command
+ * @maxlen: The length of the buffer passed in @buf
+ *
+ * Initialize a synthetic event command object. Use this before
+ * calling any of the other kprobe_event functions.
+ */
+void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
+{
+ dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_KPROBE,
+ trace_kprobe_run_command);
+}
+EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
+
+/**
+ * __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the kprobe event
+ * @loc: The location of the kprobe event
+ * @kretprobe: Is this a return probe?
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically
+ * adds a NULL to the end of the arg list. If this function is used
+ * directly, make sure the last arg in the variable arg list is NULL.
+ *
+ * Generate a kprobe event command to be executed by
+ * kprobe_event_gen_cmd_end(). This function can be used to generate the
+ * complete command or only the first part of it; in the latter case,
+ * kprobe_event_add_fields() can be used to add more fields following this.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe,
+ const char *name, const char *loc, ...)
+{
+ char buf[MAX_EVENT_NAME_LEN];
+ struct dynevent_arg arg;
+ va_list args;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_KPROBE)
+ return -EINVAL;
+
+ if (kretprobe)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "p:kprobes/%s", name);
+
+ ret = dynevent_str_add(cmd, buf);
+ if (ret)
+ return ret;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = loc;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ va_start(args, loc);
+ for (;;) {
+ const char *field;
+
+ field = va_arg(args, const char *);
+ if (!field)
+ break;
+
+ if (++cmd->n_fields > MAX_TRACE_ARGS) {
+ ret = -EINVAL;
+ break;
+ }
+
+ arg.str = field;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start);
+
+/**
+ * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the kprobe_event_add_fields() wrapper, which
+ * automatically adds a NULL to the end of the arg list. If this
+ * function is used directly, make sure the last arg in the variable
+ * arg list is NULL.
+ *
+ * Add probe fields to an existing kprobe command using a variable
+ * list of args. Fields are added in the same order they're listed.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...)
+{
+ struct dynevent_arg arg;
+ va_list args;
+ int ret = 0;
+
+ if (cmd->type != DYNEVENT_TYPE_KPROBE)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+
+ va_start(args, cmd);
+ for (;;) {
+ const char *field;
+
+ field = va_arg(args, const char *);
+ if (!field)
+ break;
+
+ if (++cmd->n_fields > MAX_TRACE_ARGS) {
+ ret = -EINVAL;
+ break;
+ }
+
+ arg.str = field;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kprobe_event_add_fields);
+
+/**
+ * kprobe_event_delete - Delete a kprobe event
+ * @name: The name of the kprobe event to delete
+ *
+ * Delete a kprobe event with the give @name from kernel code rather
+ * than directly from the command line.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int kprobe_event_delete(const char *name)
+{
+ char buf[MAX_EVENT_NAME_LEN];
+
+ snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name);
+
+ return trace_run_command(buf, create_or_delete_trace_kprobe);
+}
+EXPORT_SYMBOL_GPL(kprobe_event_delete);
+
static int trace_kprobe_release(struct dyn_event *ev)
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
@@ -783,6 +1117,10 @@ static int probes_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
ret = dyn_events_release_all(&trace_kprobe_ops);
if (ret < 0)
@@ -835,6 +1173,12 @@ static const struct seq_operations profile_seq_op = {
static int profile_open(struct inode *inode, struct file *file)
{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
return seq_open(file, &profile_seq_op);
}
@@ -965,6 +1309,9 @@ retry:
case FETCH_OP_COMM:
val = (unsigned long)current->comm;
break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
@@ -988,35 +1335,35 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
struct trace_event_file *trace_file)
{
struct kprobe_trace_entry_head *entry;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- int size, dsize, pc;
- unsigned long irq_flags;
struct trace_event_call *call = trace_probe_event_call(&tk->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
WARN_ON(call != trace_file->event_call);
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ local_save_flags(fbuffer.flags);
+ fbuffer.pc = preempt_count();
+ fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
- size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type,
- size, irq_flags, pc);
- if (!event)
+ fbuffer.event =
+ trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
+ call->event.type,
+ sizeof(*entry) + tk->tp.size + dsize,
+ fbuffer.flags, fbuffer.pc);
+ if (!fbuffer.event)
return;
- entry = ring_buffer_event_data(event);
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
- event_trigger_unlock_commit_regs(trace_file, buffer, event,
- entry, irq_flags, pc, regs);
+ trace_event_buffer_commit(&fbuffer);
}
static void
@@ -1036,36 +1383,35 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct trace_event_file *trace_file)
{
struct kretprobe_trace_entry_head *entry;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- int size, pc, dsize;
- unsigned long irq_flags;
+ struct trace_event_buffer fbuffer;
struct trace_event_call *call = trace_probe_event_call(&tk->tp);
+ int dsize;
WARN_ON(call != trace_file->event_call);
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ local_save_flags(fbuffer.flags);
+ fbuffer.pc = preempt_count();
+ fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
- size = sizeof(*entry) + tk->tp.size + dsize;
-
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type,
- size, irq_flags, pc);
- if (!event)
+ fbuffer.event =
+ trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
+ call->event.type,
+ sizeof(*entry) + tk->tp.size + dsize,
+ fbuffer.flags, fbuffer.pc);
+ if (!fbuffer.event)
return;
- entry = ring_buffer_event_data(event);
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
- event_trigger_unlock_commit_regs(trace_file, buffer, event,
- entry, irq_flags, pc, regs);
+ trace_event_buffer_commit(&fbuffer);
}
static void
@@ -1089,7 +1435,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
struct trace_probe *tp;
field = (struct kprobe_trace_entry_head *)iter->ent;
- tp = container_of(event, struct trace_probe, call.event);
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
@@ -1116,7 +1465,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
struct trace_probe *tp;
field = (struct kretprobe_trace_entry_head *)iter->ent;
- tp = container_of(event, struct trace_probe, call.event);
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
@@ -1145,23 +1497,31 @@ static int kprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret;
struct kprobe_trace_entry_head field;
- struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
- return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
static int kretprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret;
struct kretprobe_trace_entry_head field;
- struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
- return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1289,20 +1649,19 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
static int kprobe_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return enable_trace_kprobe(tk, file);
+ return enable_trace_kprobe(event, file);
case TRACE_REG_UNREGISTER:
- return disable_trace_kprobe(tk, file);
+ return disable_trace_kprobe(event, file);
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return enable_trace_kprobe(tk, NULL);
+ return enable_trace_kprobe(event, NULL);
case TRACE_REG_PERF_UNREGISTER:
- return disable_trace_kprobe(tk, NULL);
+ return disable_trace_kprobe(event, NULL);
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
case TRACE_REG_PERF_ADD:
@@ -1355,21 +1714,32 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
+static struct trace_event_fields kretprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = kretprobe_event_define_fields },
+ {}
+};
+
+static struct trace_event_fields kprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = kprobe_event_define_fields },
+ {}
+};
+
static inline void init_trace_event_call(struct trace_kprobe *tk)
{
struct trace_event_call *call = trace_probe_event_call(&tk->tp);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
- call->class->define_fields = kretprobe_event_define_fields;
+ call->class->fields_array = kretprobe_fields_array;
} else {
call->event.funcs = &kprobe_funcs;
- call->class->define_fields = kprobe_event_define_fields;
+ call->class->fields_array = kprobe_fields_array;
}
call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
- call->data = tk;
}
static int register_kprobe_event(struct trace_kprobe *tk)
@@ -1432,7 +1802,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
{
struct trace_kprobe *tk;
- tk = container_of(event_call, struct trace_kprobe, tp.call);
+ tk = trace_kprobe_primary_from_call(event_call);
+ if (unlikely(!tk))
+ return;
if (trace_probe_is_enabled(&tk->tp)) {
WARN_ON(1);
@@ -1485,11 +1857,12 @@ static __init void setup_boot_kprobe_events(void)
enable_boot_kprobe_events();
}
-/* Make a tracefs interface for controlling probe points */
-static __init int init_kprobe_trace(void)
+/*
+ * Register dynevent at subsys_initcall. This allows kernel to setup kprobe
+ * events in fs_initcall without tracefs.
+ */
+static __init int init_kprobe_trace_early(void)
{
- struct dentry *d_tracer;
- struct dentry *entry;
int ret;
ret = dyn_event_register(&trace_kprobe_ops);
@@ -1499,6 +1872,16 @@ static __init int init_kprobe_trace(void)
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
+ return 0;
+}
+subsys_initcall(init_kprobe_trace_early);
+
+/* Make a tracefs interface for controlling probe points */
+static __init int init_kprobe_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
d_tracer = tracing_init_dentry();
if (IS_ERR(d_tracer))
return 0;
@@ -1577,7 +1960,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- enable_trace_kprobe(tk, file);
+ enable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
}
@@ -1598,7 +1982,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- enable_trace_kprobe(tk, file);
+ enable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
}
@@ -1631,7 +2016,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- disable_trace_kprobe(tk, file);
+ disable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
@@ -1649,7 +2035,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- disable_trace_kprobe(tk, file);
+ disable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b0388016b687..84582bf1ed5f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -32,7 +32,7 @@ static void mmio_reset_data(struct trace_array *tr)
overrun_detected = false;
prev_overruns = 0;
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
}
static int mmio_trace_init(struct trace_array *tr)
@@ -122,7 +122,7 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
unsigned long cnt = atomic_xchg(&dropped_count, 0);
- unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
+ unsigned long over = ring_buffer_overruns(iter->array_buffer->buffer);
if (over > prev_overruns)
cnt += over - prev_overruns;
@@ -297,7 +297,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct mmiotrace_rw *rw)
{
struct trace_event_call *call = &event_mmiotrace_rw;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
int pc = preempt_count();
@@ -318,7 +318,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
+ struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
__trace_mmiotrace_rw(tr, data, rw);
}
@@ -327,7 +327,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct mmiotrace_map *map)
{
struct trace_event_call *call = &event_mmiotrace_map;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
int pc = preempt_count();
@@ -351,7 +351,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
struct trace_array_cpu *data;
preempt_disable();
- data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
+ data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
__trace_mmiotrace_map(tr, data, map);
preempt_enable();
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cab4a5398f1d..b4909082f6a4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
+ const char *fmt = concatenate ? "%*phN" : "%*ph";
- for (i = 0; i < buf_len; i++)
- trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
- buf[i]);
+ for (i = 0; i < buf_len; i += 16)
+ trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);
trace_seq_putc(p, 0);
return ret;
@@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
}
EXPORT_SYMBOL(trace_print_array_seq);
+const char *
+trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str,
+ int prefix_type, int rowsize, int groupsize,
+ const void *buf, size_t len, bool ascii)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_putc(p, '\n');
+ trace_seq_hex_dump(p, prefix_str, prefix_type,
+ rowsize, groupsize, buf, len, ascii);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+EXPORT_SYMBOL(trace_print_hex_dump_seq);
+
int trace_raw_output_prep(struct trace_iterator *iter,
struct trace_event *trace_event)
{
@@ -523,7 +538,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
struct trace_array *tr = iter->tr;
unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE;
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
- unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
+ unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
struct trace_seq *s = &iter->seq;
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index c3fd849d4a8f..d4e31e969206 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -6,6 +6,7 @@
*
*/
#include <linux/seq_file.h>
+#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
@@ -348,6 +349,12 @@ static const struct seq_operations show_format_seq_ops = {
static int
ftrace_formats_open(struct inode *inode, struct file *file)
{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
return seq_open(file, &show_format_seq_ops);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dbef0d135075..ab8b6436d53f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type)
if (!command)
return;
+ if (trace_probe_log.index >= trace_probe_log.argc) {
+ /**
+ * Set the error position is next to the last arg + space.
+ * Note that len includes the terminal null and the cursor
+ * appaers at pos + 1.
+ */
+ pos = len;
+ offset = 0;
+ }
+
/* And make a command string from argv array */
p = command;
for (i = 0; i < trace_probe_log.argc; i++) {
@@ -316,6 +326,29 @@ inval_var:
return -EINVAL;
}
+static int str_to_immediate(char *str, unsigned long *imm)
+{
+ if (isdigit(str[0]))
+ return kstrtoul(str, 0, imm);
+ else if (str[0] == '-')
+ return kstrtol(str, 0, (long *)imm);
+ else if (str[0] == '+')
+ return kstrtol(str + 1, 0, (long *)imm);
+ return -EINVAL;
+}
+
+static int __parse_imm_string(char *str, char **pbuf, int offs)
+{
+ size_t len = strlen(str);
+
+ if (str[len - 1] != '"') {
+ trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
+ return -EINVAL;
+ }
+ *pbuf = kstrndup(str, len - 1, GFP_KERNEL);
+ return 0;
+}
+
/* Recursive argument parser */
static int
parse_probe_arg(char *arg, const struct fetch_type *type,
@@ -430,7 +463,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
if (ret)
break;
- if (code->op == FETCH_OP_COMM) {
+ if (code->op == FETCH_OP_COMM ||
+ code->op == FETCH_OP_DATA) {
trace_probe_log_err(offs, COMM_CANT_DEREF);
return -EINVAL;
}
@@ -444,6 +478,21 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->offset = offset;
}
break;
+ case '\\': /* Immediate value */
+ if (arg[1] == '"') { /* Immediate string */
+ ret = __parse_imm_string(arg + 2, &tmp, offs + 2);
+ if (ret)
+ break;
+ code->op = FETCH_OP_DATA;
+ code->data = tmp;
+ } else {
+ ret = str_to_immediate(arg + 1, &code->immediate);
+ if (ret)
+ trace_probe_log_err(offs + 1, BAD_IMM);
+ else
+ code->op = FETCH_OP_IMM;
+ }
+ break;
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
@@ -542,8 +591,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
}
}
- /* Since $comm can not be dereferred, we can find $comm by strcmp */
- if (strcmp(arg, "$comm") == 0) {
+ /*
+ * Since $comm and immediate string can not be dereferred,
+ * we can find those by strcmp.
+ */
+ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
return -EINVAL;
@@ -580,7 +632,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (!strcmp(parg->type->name, "string") ||
!strcmp(parg->type->name, "ustring")) {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
- code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) {
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+ code->op != FETCH_OP_DATA) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
ret = -EINVAL;
@@ -589,9 +642,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||
parg->count) {
/*
- * IMM and COMM is pointing actual address, those must
- * be kept, and if parg->count != 0, this is an array
- * of string pointers instead of string address itself.
+ * IMM, DATA and COMM is pointing actual address, those
+ * must be kept, and if parg->count != 0, this is an
+ * array of string pointers instead of string address
+ * itself.
*/
code++;
if (code->op != FETCH_OP_NOP) {
@@ -665,7 +719,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
fail:
if (ret) {
for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
- if (code->op == FETCH_NOP_SYMBOL)
+ if (code->op == FETCH_NOP_SYMBOL ||
+ code->op == FETCH_OP_DATA)
kfree(code->data);
}
kfree(tmp);
@@ -736,7 +791,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
struct fetch_insn *code = arg->code;
while (code && code->op != FETCH_OP_END) {
- if (code->op == FETCH_NOP_SYMBOL)
+ if (code->op == FETCH_NOP_SYMBOL ||
+ code->op == FETCH_OP_DATA)
kfree(code->data);
code++;
}
@@ -820,7 +876,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
- if (strcmp(parg->type->name, "string") == 0)
+ if ((strcmp(parg->type->name, "string") == 0) ||
+ (strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -828,7 +885,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
- if (strcmp(parg->type->name, "string") == 0)
+ if ((strcmp(parg->type->name, "string") == 0) ||
+ (strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";
@@ -886,43 +944,89 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call,
return 0;
}
+static void trace_probe_event_free(struct trace_probe_event *tpe)
+{
+ kfree(tpe->class.system);
+ kfree(tpe->call.name);
+ kfree(tpe->call.print_fmt);
+ kfree(tpe);
+}
+
+int trace_probe_append(struct trace_probe *tp, struct trace_probe *to)
+{
+ if (trace_probe_has_sibling(tp))
+ return -EBUSY;
+
+ list_del_init(&tp->list);
+ trace_probe_event_free(tp->event);
+
+ tp->event = to->event;
+ list_add_tail(&tp->list, trace_probe_probe_list(to));
+
+ return 0;
+}
+
+void trace_probe_unlink(struct trace_probe *tp)
+{
+ list_del_init(&tp->list);
+ if (list_empty(trace_probe_probe_list(tp)))
+ trace_probe_event_free(tp->event);
+ tp->event = NULL;
+}
void trace_probe_cleanup(struct trace_probe *tp)
{
- struct trace_event_call *call = trace_probe_event_call(tp);
int i;
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
- kfree(call->class->system);
- kfree(call->name);
- kfree(call->print_fmt);
+ if (tp->event)
+ trace_probe_unlink(tp);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group)
+ const char *group, bool alloc_filter)
{
- struct trace_event_call *call = trace_probe_event_call(tp);
+ struct trace_event_call *call;
+ size_t size = sizeof(struct trace_probe_event);
+ int ret = 0;
if (!event || !group)
return -EINVAL;
- call->class = &tp->class;
- call->name = kstrdup(event, GFP_KERNEL);
- if (!call->name)
- return -ENOMEM;
+ if (alloc_filter)
+ size += sizeof(struct trace_uprobe_filter);
- tp->class.system = kstrdup(group, GFP_KERNEL);
- if (!tp->class.system) {
- kfree(call->name);
- call->name = NULL;
+ tp->event = kzalloc(size, GFP_KERNEL);
+ if (!tp->event)
return -ENOMEM;
+
+ INIT_LIST_HEAD(&tp->event->files);
+ INIT_LIST_HEAD(&tp->event->class.fields);
+ INIT_LIST_HEAD(&tp->event->probes);
+ INIT_LIST_HEAD(&tp->list);
+ list_add(&tp->event->probes, &tp->list);
+
+ call = trace_probe_event_call(tp);
+ call->class = &tp->event->class;
+ call->name = kstrdup(event, GFP_KERNEL);
+ if (!call->name) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ tp->event->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->event->class.system) {
+ ret = -ENOMEM;
+ goto error;
}
- INIT_LIST_HEAD(&tp->files);
- INIT_LIST_HEAD(&tp->class.fields);
return 0;
+
+error:
+ trace_probe_cleanup(tp);
+ return ret;
}
int trace_probe_register_event_call(struct trace_probe *tp)
@@ -951,7 +1055,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file)
link->file = file;
INIT_LIST_HEAD(&link->list);
- list_add_tail_rcu(&link->list, &tp->files);
+ list_add_tail_rcu(&link->list, &tp->event->files);
trace_probe_set_flag(tp, TP_FLAG_TRACE);
return 0;
}
@@ -982,8 +1086,51 @@ int trace_probe_remove_file(struct trace_probe *tp,
synchronize_rcu();
kfree(link);
- if (list_empty(&tp->files))
+ if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
return 0;
}
+
+/*
+ * Return the smallest index of different type argument (start from 1).
+ * If all argument types and name are same, return 0.
+ */
+int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b)
+{
+ int i;
+
+ /* In case of more arguments */
+ if (a->nr_args < b->nr_args)
+ return a->nr_args + 1;
+ if (a->nr_args > b->nr_args)
+ return b->nr_args + 1;
+
+ for (i = 0; i < a->nr_args; i++) {
+ if ((b->nr_args <= i) ||
+ ((a->args[i].type != b->args[i].type) ||
+ (a->args[i].count != b->args[i].count) ||
+ strcmp(a->args[i].name, b->args[i].name)))
+ return i + 1;
+ }
+
+ return 0;
+}
+
+bool trace_probe_match_command_args(struct trace_probe *tp,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+ int i;
+
+ if (tp->nr_args < argc)
+ return false;
+
+ for (i = 0; i < argc; i++) {
+ snprintf(buf, sizeof(buf), "%s=%s",
+ tp->args[i].name, tp->args[i].comm);
+ if (strcmp(buf, argv[i]))
+ return false;
+ }
+ return true;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index d1714820efe1..a0ff9e200ef6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -89,6 +89,7 @@ enum fetch_op {
FETCH_OP_COMM, /* Current comm */
FETCH_OP_ARG, /* Function argument : .param */
FETCH_OP_FOFFS, /* File offset: .immediate */
+ FETCH_OP_DATA, /* Allocated data: .data */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
@@ -222,11 +223,25 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
-struct trace_probe {
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
+/* Event call and class holder */
+struct trace_probe_event {
unsigned int flags; /* For TP_FLAG_* */
struct trace_event_class class;
struct trace_event_call call;
struct list_head files;
+ struct list_head probes;
+ struct trace_uprobe_filter filter[0];
+};
+
+struct trace_probe {
+ struct list_head list;
+ struct trace_probe_event *event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -240,19 +255,19 @@ struct event_file_link {
static inline bool trace_probe_test_flag(struct trace_probe *tp,
unsigned int flag)
{
- return !!(tp->flags & flag);
+ return !!(tp->event->flags & flag);
}
static inline void trace_probe_set_flag(struct trace_probe *tp,
unsigned int flag)
{
- tp->flags |= flag;
+ tp->event->flags |= flag;
}
static inline void trace_probe_clear_flag(struct trace_probe *tp,
unsigned int flag)
{
- tp->flags &= ~flag;
+ tp->event->flags &= ~flag;
}
static inline bool trace_probe_is_enabled(struct trace_probe *tp)
@@ -262,45 +277,76 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp)
static inline const char *trace_probe_name(struct trace_probe *tp)
{
- return trace_event_name(&tp->call);
+ return trace_event_name(&tp->event->call);
}
static inline const char *trace_probe_group_name(struct trace_probe *tp)
{
- return tp->call.class->system;
+ return tp->event->call.class->system;
}
static inline struct trace_event_call *
trace_probe_event_call(struct trace_probe *tp)
{
- return &tp->call;
+ return &tp->event->call;
+}
+
+static inline struct trace_probe_event *
+trace_probe_event_from_call(struct trace_event_call *event_call)
+{
+ return container_of(event_call, struct trace_probe_event, call);
+}
+
+static inline struct trace_probe *
+trace_probe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe_event *tpe = trace_probe_event_from_call(call);
+
+ return list_first_entry(&tpe->probes, struct trace_probe, list);
+}
+
+static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp)
+{
+ return &tp->event->probes;
+}
+
+static inline bool trace_probe_has_sibling(struct trace_probe *tp)
+{
+ struct list_head *list = trace_probe_probe_list(tp);
+
+ return !list_empty(list) && !list_is_singular(list);
}
static inline int trace_probe_unregister_event_call(struct trace_probe *tp)
{
/* tp->event is unregistered in trace_remove_event_call() */
- return trace_remove_event_call(&tp->call);
+ return trace_remove_event_call(&tp->event->call);
}
static inline bool trace_probe_has_single_file(struct trace_probe *tp)
{
- return !!list_is_singular(&tp->files);
+ return !!list_is_singular(&tp->event->files);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group);
+ const char *group, bool alloc_filter);
void trace_probe_cleanup(struct trace_probe *tp);
+int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
+void trace_probe_unlink(struct trace_probe *tp);
int trace_probe_register_event_call(struct trace_probe *tp);
int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file);
int trace_probe_remove_file(struct trace_probe *tp,
struct trace_event_file *file);
struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
struct trace_event_file *file);
+int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
+bool trace_probe_match_command_args(struct trace_probe *tp,
+ int argc, const char **argv);
#define trace_probe_for_each_link(pos, tp) \
- list_for_each_entry(pos, &(tp)->files, list)
+ list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
- list_for_each_entry_rcu(pos, &(tp)->files, list)
+ list_for_each_entry_rcu(pos, &(tp)->event->files, list)
/* Check the name is good for event/group/fields */
static inline bool is_good_name(const char *name)
@@ -370,6 +416,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_VAR, "Invalid $-valiable specified"), \
C(BAD_REG_NAME, "Invalid register name"), \
C(BAD_MEM_ADDR, "Invalid memory address"), \
+ C(BAD_IMM, "Invalid immediate value"), \
+ C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \
C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \
C(BAD_FILE_OFFS, "Invalid file offset value"), \
C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \
@@ -393,7 +441,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(ARG_TOO_LONG, "Argument expression is too long"), \
C(NO_ARG_BODY, "No argument expression"), \
C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\
- C(FAIL_REG_PROBE, "Failed to register probe event"),
+ C(FAIL_REG_PROBE, "Failed to register probe event"),\
+ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\
+ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\
+ C(SAME_PROBE, "There is already the exact same probe event"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e288168661e1..e304196d7c28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -89,8 +89,10 @@ static void tracing_sched_unregister(void)
static void tracing_start_sched_switch(int ops)
{
- bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
+ bool sched_register;
+
mutex_lock(&sched_register_mutex);
+ sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
switch (ops) {
case RECORD_CMDLINE:
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..97b10bb31a1f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -82,7 +82,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (cpu != wakeup_current_cpu)
goto out_enable;
- *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ *data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -378,7 +378,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
unsigned long flags, int pc)
{
struct trace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -408,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
sizeof(*entry), flags, pc);
@@ -459,7 +459,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -471,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
goto out_unlock;
/* The task we are waiting for is waking up */
- data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
+ data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -494,7 +494,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -513,7 +513,7 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
local_irq_save(flags);
arch_spin_lock(&wakeup_lock);
@@ -551,7 +551,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
return;
pc = preempt_count();
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -579,12 +579,11 @@ probe_wakeup(void *ignore, struct task_struct *p)
else
tracing_dl = 0;
- wakeup_task = p;
- get_task_struct(wakeup_task);
+ wakeup_task = get_task_struct(p);
local_save_flags(flags);
- data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
+ data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
__trace_stack(wakeup_trace, flags, 0, pc);
@@ -599,7 +598,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
@@ -631,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -649,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 69ee8ef12cee..b5e3496cf803 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -23,7 +23,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
return 0;
}
-static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
+static int trace_test_buffer_cpu(struct array_buffer *buf, int cpu)
{
struct ring_buffer_event *event;
struct trace_entry *entry;
@@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct array_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -362,7 +362,7 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
msleep(100);
/* we should have nothing in the buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
+ ret = trace_test_buffer(&tr->array_buffer, &count);
if (ret)
goto out;
@@ -383,7 +383,7 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
+ ret = trace_test_buffer(&tr->array_buffer, &count);
ftrace_enabled = 1;
tracing_start();
@@ -682,7 +682,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
+ ret = trace_test_buffer(&tr->array_buffer, &count);
ftrace_enabled = 1;
trace->reset(tr);
@@ -768,7 +768,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Simulate the init() callback but we attach a watchdog callback
* to detect and recover from possible hangs
*/
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(&tr->array_buffer);
set_graph_array(tr);
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
@@ -790,7 +790,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
+ ret = trace_test_buffer(&tr->array_buffer, &count);
/* Need to also simulate the tr->reset to remove this fgraph_ops */
tracing_stop_cmdline_record();
@@ -848,7 +848,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
@@ -910,7 +910,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
@@ -976,7 +976,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ ret = trace_test_buffer(&tr->array_buffer, NULL);
if (ret)
goto out;
@@ -1006,7 +1006,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ ret = trace_test_buffer(&tr->array_buffer, NULL);
if (ret)
goto out;
@@ -1136,7 +1136,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
@@ -1177,7 +1177,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
+ ret = trace_test_buffer(&tr->array_buffer, &count);
trace->reset(tr);
tracing_start();
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 6b1c562ffdaf..1d84fcc78e3e 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -30,9 +30,6 @@
/* How much buffer is left on the trace_seq? */
#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
-/* How much buffer is written? */
-#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
-
/*
* trace_seq should work with being initialized with 0s.
*/
@@ -376,3 +373,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
return seq_buf_to_user(&s->seq, ubuf, cnt);
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
+
+int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
+ int prefix_type, int rowsize, int groupsize,
+ const void *buf, size_t len, bool ascii)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return 0;
+
+ __trace_seq_init(s);
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ seq_buf_hex_dump(&(s->seq), prefix_str,
+ prefix_type, rowsize, groupsize,
+ buf, len, ascii);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(trace_seq_hex_dump);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5d16f73898db..c557f42a9397 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -5,6 +5,7 @@
*/
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
+#include <linux/security.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
@@ -53,6 +54,104 @@ static void print_max_stack(void)
}
}
+/*
+ * The stack tracer looks for a maximum stack at each call from a function. It
+ * registers a callback from ftrace, and in that callback it examines the stack
+ * size. It determines the stack size from the variable passed in, which is the
+ * address of a local variable in the stack_trace_call() callback function.
+ * The stack size is calculated by the address of the local variable to the top
+ * of the current stack. If that size is smaller than the currently saved max
+ * stack size, nothing more is done.
+ *
+ * If the size of the stack is greater than the maximum recorded size, then the
+ * following algorithm takes place.
+ *
+ * For architectures (like x86) that store the function's return address before
+ * saving the function's local variables, the stack will look something like
+ * this:
+ *
+ * [ top of stack ]
+ * 0: sys call entry frame
+ * 10: return addr to entry code
+ * 11: start of sys_foo frame
+ * 20: return addr to sys_foo
+ * 21: start of kernel_func_bar frame
+ * 30: return addr to kernel_func_bar
+ * 31: [ do trace stack here ]
+ *
+ * The save_stack_trace() is called returning all the functions it finds in the
+ * current stack. Which would be (from the bottom of the stack to the top):
+ *
+ * return addr to kernel_func_bar
+ * return addr to sys_foo
+ * return addr to entry code
+ *
+ * Now to figure out how much each of these functions' local variable size is,
+ * a search of the stack is made to find these values. When a match is made, it
+ * is added to the stack_dump_trace[] array. The offset into the stack is saved
+ * in the stack_trace_index[] array. The above example would show:
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 30
+ * return addr to sys_foo | 20
+ * return addr to entry | 10
+ *
+ * The print_max_stack() function above, uses these values to print the size of
+ * each function's portion of the stack.
+ *
+ * for (i = 0; i < nr_entries; i++) {
+ * size = i == nr_entries - 1 ? stack_trace_index[i] :
+ * stack_trace_index[i] - stack_trace_index[i+1]
+ * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]);
+ * }
+ *
+ * The above shows
+ *
+ * depth size location
+ * ----- ---- --------
+ * 0 30 10 kernel_func_bar
+ * 1 20 10 sys_foo
+ * 2 10 10 entry code
+ *
+ * Now for architectures that might save the return address after the functions
+ * local variables (saving the link register before calling nested functions),
+ * this will cause the stack to look a little different:
+ *
+ * [ top of stack ]
+ * 0: sys call entry frame
+ * 10: start of sys_foo_frame
+ * 19: return addr to entry code << lr saved before calling kernel_func_bar
+ * 20: start of kernel_func_bar frame
+ * 29: return addr to sys_foo_frame << lr saved before calling next function
+ * 30: [ do trace stack here ]
+ *
+ * Although the functions returned by save_stack_trace() may be the same, the
+ * placement in the stack will be different. Using the same algorithm as above
+ * would yield:
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 30
+ * return addr to sys_foo | 29
+ * return addr to entry | 19
+ *
+ * Where the mapping is off by one:
+ *
+ * kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
+ *
+ * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the
+ * values in stack_trace_index[] are shifted by one to and the number of
+ * stack trace entries is decremented by one.
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 29
+ * return addr to sys_foo | 19
+ *
+ * Although the entry function is not displayed, the first function (sys_foo)
+ * will still include the stack size of it.
+ */
static void check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
@@ -158,6 +257,20 @@ static void check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
+ /*
+ * Some archs will store the link register before calling
+ * nested functions. This means the saved return address
+ * comes after the local storage, and we need to shift
+ * for that.
+ */
+ if (x > 1) {
+ memmove(&stack_trace_index[0], &stack_trace_index[1],
+ sizeof(stack_trace_index[0]) * (x - 1));
+ x--;
+ }
+#endif
+
stack_trace_nr_entries = x;
if (task_stack_end_corrupted(current)) {
@@ -170,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -358,6 +476,12 @@ static const struct seq_operations stack_trace_seq_ops = {
static int stack_trace_open(struct inode *inode, struct file *file)
{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
return seq_open(file, &stack_trace_seq_ops);
}
@@ -375,6 +499,7 @@ stack_trace_filter_open(struct inode *inode, struct file *file)
{
struct ftrace_ops *ops = inode->i_private;
+ /* Checks for tracefs lockdown */
return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
inode, file);
}
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75bf1bcb4a8a..d1fa19773cc8 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -9,7 +9,7 @@
*
*/
-
+#include <linux/security.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
@@ -72,9 +72,7 @@ static void destroy_session(struct stat_session *session)
kfree(session);
}
-typedef int (*cmp_stat_t)(void *, void *);
-
-static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp)
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
struct stat_node *data;
@@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
* This one will force an insertion as right-most node
* in the rbtree.
*/
-static int dummy_cmp(void *p1, void *p2)
+static int dummy_cmp(const void *p1, const void *p2)
{
return -1;
}
@@ -238,6 +236,10 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
struct seq_file *m;
struct stat_session *session = inode->i_private;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
ret = stat_seq_init(session);
if (ret)
return ret;
@@ -278,18 +280,22 @@ static int tracing_stat_init(void)
d_tracing = tracing_init_dentry();
if (IS_ERR(d_tracing))
- return 0;
+ return -ENODEV;
stat_dir = tracefs_create_dir("trace_stat", d_tracing);
- if (!stat_dir)
+ if (!stat_dir) {
pr_warn("Could not create tracefs 'trace_stat' entry\n");
+ return -ENOMEM;
+ }
return 0;
}
static int init_stat_file(struct stat_session *session)
{
- if (!stat_dir && tracing_stat_init())
- return -ENODEV;
+ int ret;
+
+ if (!stat_dir && (ret = tracing_stat_init()))
+ return ret;
session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
@@ -302,7 +308,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret;
+ int ret = -EINVAL;
if (!trace)
return -EINVAL;
@@ -313,17 +319,15 @@ int register_stat_tracer(struct tracer_stat *trace)
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
- if (node->ts == trace) {
- mutex_unlock(&all_stat_sessions_mutex);
- return -EINVAL;
- }
+ if (node->ts == trace)
+ goto out;
}
- mutex_unlock(&all_stat_sessions_mutex);
+ ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- return -ENOMEM;
+ goto out;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -332,15 +336,16 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- return ret;
+ goto out;
}
+ ret = 0;
/* Register */
- mutex_lock(&all_stat_sessions_mutex);
list_add_tail(&session->session_list, &all_stat_sessions);
+ out:
mutex_unlock(&all_stat_sessions_mutex);
- return 0;
+ return ret;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 8786d17caf49..31d7dc5bf1db 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -16,7 +16,7 @@ struct tracer_stat {
void *(*stat_start)(struct tracer_stat *trace);
void *(*stat_next)(void *prev, int idx);
/* Compare two entries for stats sorting */
- int (*stat_cmp)(void *p1, void *p2);
+ cmp_func_t stat_cmp;
/* Print a stat entry */
int (*stat_show)(struct seq_file *s, void *p);
/* Release an entry */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index fa8fbff736d6..d85a2f0f316b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
+#include <linux/xarray.h>
#include <asm/syscall.h>
#include "trace_output.h"
@@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call)
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
+static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
@@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall)
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
+ if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
+ return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
+
if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
return NULL;
@@ -198,11 +203,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
extern char *__bad_type_size(void);
-#define SYSCALL_FIELD(type, field, name) \
- sizeof(type) != sizeof(trace.field) ? \
- __bad_type_size() : \
- #type, #name, offsetof(typeof(trace), field), \
- sizeof(trace.field), is_signed_type(type)
+#define SYSCALL_FIELD(_type, _name) { \
+ .type = #_type, .name = #_name, \
+ .size = sizeof(_type), .align = __alignof__(_type), \
+ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -269,42 +273,23 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
- int ret;
- int i;
int offset = offsetof(typeof(trace), args);
-
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
- FILTER_OTHER);
- if (ret)
- return ret;
+ int ret = 0;
+ int i;
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
sizeof(unsigned long), 0,
FILTER_OTHER);
+ if (ret)
+ break;
offset += sizeof(unsigned long);
}
return ret;
}
-static int __init syscall_exit_define_fields(struct trace_event_call *call)
-{
- struct syscall_trace_exit trace;
- int ret;
-
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
- FILTER_OTHER);
- if (ret)
- return ret;
-
- ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
- FILTER_OTHER);
-
- return ret;
-}
-
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -312,7 +297,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long irq_flags;
unsigned long args[6];
int pc;
@@ -340,7 +325,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
local_save_flags(irq_flags);
pc = preempt_count();
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->enter_event->event.type, size, irq_flags, pc);
if (!event)
@@ -362,7 +347,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
+ struct trace_buffer *buffer;
unsigned long irq_flags;
int pc;
int syscall_nr;
@@ -386,7 +371,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
local_save_flags(irq_flags);
pc = preempt_count();
- buffer = tr->trace_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->exit_event->event.type, sizeof(*entry),
irq_flags, pc);
@@ -502,6 +487,13 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return id;
}
+static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
+ SYSCALL_FIELD(int, __syscall_nr),
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = syscall_enter_define_fields },
+ {}
+};
+
struct trace_event_functions enter_syscall_print_funcs = {
.trace = print_syscall_enter,
};
@@ -513,7 +505,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
- .define_fields = syscall_enter_define_fields,
+ .fields_array = syscall_enter_fields_array,
.get_fields = syscall_get_enter_fields,
.raw_init = init_syscall_trace,
};
@@ -521,7 +513,11 @@ struct trace_event_class __refdata event_class_syscall_enter = {
struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
- .define_fields = syscall_exit_define_fields,
+ .fields_array = (struct trace_event_fields[]){
+ SYSCALL_FIELD(int, __syscall_nr),
+ SYSCALL_FIELD(long, ret),
+ {}
+ },
.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
.raw_init = init_syscall_trace,
};
@@ -536,12 +532,16 @@ void __init init_ftrace_syscalls(void)
struct syscall_metadata *meta;
unsigned long addr;
int i;
-
- syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
- GFP_KERNEL);
- if (!syscalls_metadata) {
- WARN_ON(1);
- return;
+ void *ret;
+
+ if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+ syscalls_metadata = kcalloc(NR_syscalls,
+ sizeof(*syscalls_metadata),
+ GFP_KERNEL);
+ if (!syscalls_metadata) {
+ WARN_ON(1);
+ return;
+ }
}
for (i = 0; i < NR_syscalls; i++) {
@@ -551,7 +551,16 @@ void __init init_ftrace_syscalls(void)
continue;
meta->syscall_nr = i;
- syscalls_metadata[i] = meta;
+
+ if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+ syscalls_metadata[i] = meta;
+ } else {
+ ret = xa_store(&syscalls_metadata_sparse, i, meta,
+ GFP_KERNEL);
+ WARN(xa_is_err(ret),
+ "Syscall memory allocation failed\n");
+ }
+
}
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 1ceedb9146b1..18d16f3ef980 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_uprobe: " fmt
+#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -33,18 +34,12 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-struct trace_uprobe_filter {
- rwlock_t rwlock;
- int nr_systemwide;
- struct list_head perf_events;
-};
-
static int trace_uprobe_create(int argc, const char **argv);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
static bool trace_uprobe_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations trace_uprobe_ops = {
.create = trace_uprobe_create,
@@ -59,7 +54,6 @@ static struct dyn_event_operations trace_uprobe_ops = {
*/
struct trace_uprobe {
struct dyn_event devent;
- struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
@@ -248,6 +242,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
case FETCH_OP_COMM:
val = FETCH_TOKEN_COMM;
break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
@@ -284,13 +281,54 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev)
return trace_probe_is_enabled(&tu->tp);
}
+static bool trace_uprobe_match_command_head(struct trace_uprobe *tu,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+ int len;
+
+ if (!argc)
+ return true;
+
+ len = strlen(tu->filename);
+ if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':')
+ return false;
+
+ if (tu->ref_ctr_offset == 0)
+ snprintf(buf, sizeof(buf), "0x%0*lx",
+ (int)(sizeof(void *) * 2), tu->offset);
+ else
+ snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)",
+ (int)(sizeof(void *) * 2), tu->offset,
+ tu->ref_ctr_offset);
+ if (strcmp(buf, &argv[0][len + 1]))
+ return false;
+
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tu->tp, argc, argv);
+}
+
static bool trace_uprobe_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct trace_uprobe *tu = to_trace_uprobe(ev);
return strcmp(trace_probe_name(&tu->tp), event) == 0 &&
- (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0);
+ (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) &&
+ trace_uprobe_match_command_head(tu, argc, argv);
+}
+
+static nokprobe_inline struct trace_uprobe *
+trace_uprobe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return NULL;
+
+ return container_of(tp, struct trace_uprobe, tp);
}
/*
@@ -306,7 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group);
+ ret = trace_probe_init(&tu->tp, event, group, true);
if (ret < 0)
goto error;
@@ -314,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
- init_trace_uprobe_filter(&tu->filter);
+ init_trace_uprobe_filter(tu->tp.event->filter);
return tu;
error:
@@ -352,15 +390,76 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
int ret;
+ if (trace_probe_has_sibling(&tu->tp))
+ goto unreg;
+
ret = unregister_uprobe_event(tu);
if (ret)
return ret;
+unreg:
dyn_event_remove(&tu->devent);
+ trace_probe_unlink(&tu->tp);
free_trace_uprobe(tu);
return 0;
}
+static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig,
+ struct trace_uprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ struct trace_probe *pos;
+ struct inode *comp_inode = d_real_inode(comp->path.dentry);
+ int i;
+
+ list_for_each_entry(pos, &tpe->probes, list) {
+ orig = container_of(pos, struct trace_uprobe, tp);
+ if (comp_inode != d_real_inode(orig->path.dentry) ||
+ comp->offset != orig->offset)
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
+{
+ int ret;
+
+ ret = trace_probe_compare_arg_type(&tu->tp, &to->tp);
+ if (ret) {
+ /* Note that argument starts index = 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_uprobe_has_same_uprobe(to, tu)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tu->tp, &to->tp);
+ if (!ret)
+ dyn_event_add(&tu->devent);
+
+ return ret;
+}
+
/*
* Uprobe with multiple reference counter is not allowed. i.e.
* If inode and offset matches, reference counter offset *must*
@@ -370,25 +469,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
* as the new one does not conflict with any other existing
* ones.
*/
-static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new)
+static int validate_ref_ctr_offset(struct trace_uprobe *new)
{
struct dyn_event *pos;
- struct trace_uprobe *tmp, *old = NULL;
+ struct trace_uprobe *tmp;
struct inode *new_inode = d_real_inode(new->path.dentry);
- old = find_probe_event(trace_probe_name(&new->tp),
- trace_probe_group_name(&new->tp));
-
for_each_trace_uprobe(tmp, pos) {
- if ((old ? old != tmp : true) &&
- new_inode == d_real_inode(tmp->path.dentry) &&
+ if (new_inode == d_real_inode(tmp->path.dentry) &&
new->offset == tmp->offset &&
new->ref_ctr_offset != tmp->ref_ctr_offset) {
pr_warn("Reference counter offset mismatch.");
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
}
- return old;
+ return 0;
}
/* Register a trace_uprobe and probe_event */
@@ -399,18 +494,22 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&event_mutex);
- /* register as an event */
- old_tu = find_old_trace_uprobe(tu);
- if (IS_ERR(old_tu)) {
- ret = PTR_ERR(old_tu);
+ ret = validate_ref_ctr_offset(tu);
+ if (ret)
goto end;
- }
+ /* register as an event */
+ old_tu = find_probe_event(trace_probe_name(&tu->tp),
+ trace_probe_group_name(&tu->tp));
if (old_tu) {
- /* delete old event */
- ret = unregister_trace_uprobe(old_tu);
- if (ret)
- goto end;
+ if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ ret = -EEXIST;
+ } else {
+ ret = append_trace_uprobe(tu, old_tu);
+ }
+ goto end;
}
ret = register_uprobe_event(tu);
@@ -664,6 +763,10 @@ static int probes_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
ret = dyn_events_release_all(&trace_uprobe_ops);
if (ret)
@@ -713,6 +816,12 @@ static const struct seq_operations profile_seq_op = {
static int profile_open(struct inode *inode, struct file *file)
{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
return seq_open(file, &profile_seq_op);
}
@@ -822,8 +931,8 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
+ struct trace_buffer *buffer;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
@@ -897,7 +1006,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
u8 *data;
entry = (struct uprobe_trace_entry_head *)iter->ent;
- tu = container_of(event, struct trace_uprobe, tp.call.event);
+ tu = trace_uprobe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (unlikely(!tu))
+ goto out;
if (is_ret_probe(tu)) {
trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
@@ -924,28 +1036,73 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
-static int
-probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
- filter_func_t filter)
+static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
+{
+ int ret;
+
+ tu->consumer.filter = filter;
+ tu->inode = d_real_inode(tu->path.dentry);
+
+ if (tu->ref_ctr_offset)
+ ret = uprobe_register_refctr(tu->inode, tu->offset,
+ tu->ref_ctr_offset, &tu->consumer);
+ else
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+
+ if (ret)
+ tu->inode = NULL;
+
+ return ret;
+}
+
+static void __probe_event_disable(struct trace_probe *tp)
{
- bool enabled = trace_probe_is_enabled(&tu->tp);
+ struct trace_probe *pos;
+ struct trace_uprobe *tu;
+
+ tu = container_of(tp, struct trace_uprobe, tp);
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ if (!tu->inode)
+ continue;
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->inode = NULL;
+ }
+}
+
+static int probe_event_enable(struct trace_event_call *call,
+ struct trace_event_file *file, filter_func_t filter)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ bool enabled;
int ret;
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This may also change "enabled" state */
if (file) {
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
+ if (trace_probe_test_flag(tp, TP_FLAG_PROFILE))
return -EINTR;
- ret = trace_probe_add_file(&tu->tp, file);
+ ret = trace_probe_add_file(tp, file);
if (ret < 0)
return ret;
} else {
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
+ if (trace_probe_test_flag(tp, TP_FLAG_TRACE))
return -EINTR;
- trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE);
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
}
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+ tu = container_of(tp, struct trace_uprobe, tp);
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
if (enabled)
return 0;
@@ -954,18 +1111,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
if (ret)
goto err_flags;
- tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
- if (tu->ref_ctr_offset) {
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- } else {
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ ret = trace_uprobe_enable(tu, filter);
+ if (ret) {
+ __probe_event_disable(tp);
+ goto err_buffer;
+ }
}
- if (ret)
- goto err_buffer;
-
return 0;
err_buffer:
@@ -973,33 +1127,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
err_flags:
if (file)
- trace_probe_remove_file(&tu->tp, file);
+ trace_probe_remove_file(tp, file);
else
- trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
return ret;
}
-static void
-probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
+static void probe_event_disable(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- if (!trace_probe_is_enabled(&tu->tp))
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return;
+
+ if (!trace_probe_is_enabled(tp))
return;
if (file) {
- if (trace_probe_remove_file(&tu->tp, file) < 0)
+ if (trace_probe_remove_file(tp, file) < 0)
return;
- if (trace_probe_is_enabled(&tu->tp))
+ if (trace_probe_is_enabled(tp))
return;
} else
- trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
-
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
-
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ __probe_event_disable(tp);
uprobe_buffer_disable();
}
@@ -1007,7 +1163,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, size;
struct uprobe_trace_entry_head field;
- struct trace_uprobe *tu = event_call->data;
+ struct trace_uprobe *tu;
+
+ tu = trace_uprobe_primary_from_call(event_call);
+ if (unlikely(!tu))
+ return -ENODEV;
if (is_ret_probe(tu)) {
DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
@@ -1039,39 +1199,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
}
static inline bool
-uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+trace_uprobe_filter_event(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
- return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
+ return __uprobe_perf_filter(filter, event->hw.target->mm);
}
-static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
list_del(&event->hw.tp_list);
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
(event->hw.target->flags & PF_EXITING) ||
- uprobe_filter_event(tu, event);
+ trace_uprobe_filter_event(filter, event);
} else {
- tu->filter.nr_systemwide--;
- done = tu->filter.nr_systemwide;
+ filter->nr_systemwide--;
+ done = filter->nr_systemwide;
}
- write_unlock(&tu->filter.rwlock);
-
- if (!done)
- return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ write_unlock(&filter->rwlock);
- return 0;
+ return done;
}
-static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+/* This returns true if the filter always covers target mm */
+static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- int err;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
/*
* event->parent != NULL means copy_process(), we can avoid
@@ -1081,35 +1241,83 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
* attr.enable_on_exec means that exec/mmap will install the
* breakpoints we need.
*/
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
event->parent || event->attr.enable_on_exec ||
- uprobe_filter_event(tu, event);
- list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ trace_uprobe_filter_event(filter, event);
+ list_add(&event->hw.tp_list, &filter->perf_events);
} else {
- done = tu->filter.nr_systemwide;
- tu->filter.nr_systemwide++;
+ done = filter->nr_systemwide;
+ filter->nr_systemwide++;
}
- write_unlock(&tu->filter.rwlock);
+ write_unlock(&filter->rwlock);
+
+ return done;
+}
+
+static int uprobe_perf_close(struct trace_event_call *call,
+ struct perf_event *event)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ int ret = 0;
- err = 0;
- if (!done) {
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int uprobe_perf_open(struct trace_event_call *call,
+ struct perf_event *event)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ int err = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_add(tu->tp.event->filter, event))
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
- if (err)
- uprobe_perf_close(tu, event);
+ if (err) {
+ uprobe_perf_close(call, event);
+ break;
+ }
}
+
return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
+ struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
int ret;
tu = container_of(uc, struct trace_uprobe, consumer);
- read_lock(&tu->filter.rwlock);
- ret = __uprobe_perf_filter(&tu->filter, mm);
- read_unlock(&tu->filter.rwlock);
+ filter = tu->tp.event->filter;
+
+ read_lock(&filter->rwlock);
+ ret = __uprobe_perf_filter(filter, mm);
+ read_unlock(&filter->rwlock);
return ret;
}
@@ -1213,30 +1421,29 @@ static int
trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
void *data)
{
- struct trace_uprobe *tu = event->data;
struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, file, NULL);
+ return probe_event_enable(event, file, NULL);
case TRACE_REG_UNREGISTER:
- probe_event_disable(tu, file);
+ probe_event_disable(event, file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, NULL, uprobe_perf_filter);
+ return probe_event_enable(event, NULL, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
- probe_event_disable(tu, NULL);
+ probe_event_disable(event, NULL);
return 0;
case TRACE_REG_PERF_OPEN:
- return uprobe_perf_open(tu, data);
+ return uprobe_perf_open(event, data);
case TRACE_REG_PERF_CLOSE:
- return uprobe_perf_close(tu, data);
+ return uprobe_perf_close(event, data);
#endif
default:
@@ -1321,16 +1528,20 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
+static struct trace_event_fields uprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = uprobe_event_define_fields },
+ {}
+};
+
static inline void init_trace_event_call(struct trace_uprobe *tu)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
-
call->event.funcs = &uprobe_funcs;
- call->class->define_fields = uprobe_event_define_fields;
+ call->class->fields_array = uprobe_fields_array;
call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
call->class->reg = trace_uprobe_register;
- call->data = tu;
}
static int register_uprobe_event(struct trace_uprobe *tu)
@@ -1399,7 +1610,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
{
struct trace_uprobe *tu;
- tu = container_of(event_call, struct trace_uprobe, tp.call);
+ tu = trace_uprobe_primary_from_call(event_call);
free_trace_uprobe(tu);
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9a1c22310323..9e31bfc818ff 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
#define DEFINE_TRACING_MAP_CMP_FN(type) \
static int tracing_map_cmp_##type(void *val_a, void *val_b) \
{ \
- type a = *(type *)val_a; \
- type b = *(type *)val_b; \
+ type a = (type)(*(u64 *)val_a); \
+ type b = (type)(*(u64 *)val_b); \
\
return (a > b) ? 1 : ((a < b) ? -1 : 0); \
}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 7be3e7530841..257ffb993ea2 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -24,6 +24,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
const struct cred *tcred;
u64 utime, stime, utimescaled, stimescaled;
u64 delta;
+ time64_t btime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -32,9 +33,11 @@ void bacct_add_tsk(struct user_namespace *user_ns,
/* Convert to micro seconds */
do_div(delta, NSEC_PER_USEC);
stats->ac_etime = delta;
- /* Convert to seconds for btime */
- do_div(delta, USEC_PER_SEC);
- stats->ac_btime = get_seconds() - delta;
+ /* Convert to seconds for btime (note y2106 limit) */
+ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
+ stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
+ stats->ac_btime64 = btime;
+
if (thread_group_leader(tsk)) {
stats->ac_exitcode = tsk->exit_code;
if (tsk->flags & PF_FORKNOEXEC)
diff --git a/kernel/up.c b/kernel/up.c
index 862b460ab97a..c6f323dcd45b 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -14,7 +14,8 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
{
unsigned long flags;
- WARN_ON(cpu != 0);
+ if (cpu != 0)
+ return -ENXIO;
local_irq_save(flags);
func(info);
@@ -68,9 +69,8 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same condtions in UP and SMP.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
unsigned long flags;
@@ -84,11 +84,10 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9306fe..b6b1f54a7837 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+#define SOFTLOCKUP_RESET ULONG_MAX
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -173,8 +175,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
@@ -274,7 +274,7 @@ notrace void touch_softlockup_watchdog_sched(void)
* Preemption can be enabled. It doesn't matter which CPU's timestamp
* gets zeroed here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, 0);
+ raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
notrace void touch_softlockup_watchdog(void)
@@ -298,14 +298,14 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
wq_watchdog_touch(-1);
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, 0);
+ __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
static int is_softlockup(unsigned long touch_ts)
@@ -350,8 +350,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
- __this_cpu_write(soft_lockup_hrtimer_cnt,
- __this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
@@ -383,7 +381,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == 0) {
+ if (touch_ts == SOFTLOCKUP_RESET) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -416,22 +414,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true) {
- /*
- * When multiple processes are causing softlockups the
- * softlockup detector only warns on the first one
- * because the code relies on a full quiet cycle to
- * re-arm. The second process prevents the quiet cycle
- * and never gets reported. Use task pointers to detect
- * this.
- */
- if (__this_cpu_read(softlockup_task_ptr_saved) !=
- current) {
- __this_cpu_write(soft_watchdog_warn, false);
- __touch_watchdog();
- }
+ if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -447,7 +431,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
- __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
@@ -490,10 +473,10 @@ static void watchdog_enable(unsigned int cpu)
* Start the timer first to prevent the NMI watchdog triggering
* before the timer has a chance to fire.
*/
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer->function = watchdog_timer_fn;
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
/* Initialize timestamp */
__touch_watchdog();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 601d61150b65..301db4406bc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -248,7 +248,7 @@ struct workqueue_struct {
struct list_head flusher_overflow; /* WQ: flush overflow list */
struct list_head maydays; /* MD: pwqs requesting rescue */
- struct worker *rescuer; /* I: rescue worker */
+ struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
@@ -355,6 +355,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
+static void show_pwq(struct pool_workqueue *pwq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -364,11 +365,6 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
-#define assert_rcu_or_wq_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
- !lockdep_is_held(&wq->mutex), \
- "RCU or wq->mutex should be held")
-
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex) && \
@@ -425,9 +421,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* ignored.
*/
#define for_each_pwq(pwq, wq) \
- list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
- if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
- else
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
+ lockdep_is_held(&(wq->mutex)))
#ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -2271,7 +2266,7 @@ __acquires(&pool->lock)
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
- trace_workqueue_execute_end(work);
+ trace_workqueue_execute_end(work, worker->current_func);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
@@ -2285,7 +2280,7 @@ __acquires(&pool->lock)
}
/*
- * The following prevents a kworker from hogging CPU on !PREEMPT
+ * The following prevents a kworker from hogging CPU on !PREEMPTION
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
@@ -2532,8 +2527,14 @@ repeat:
*/
if (need_to_create_worker(pool)) {
spin_lock(&wq_mayday_lock);
- get_pwq(pwq);
- list_move_tail(&pwq->mayday_node, &wq->maydays);
+ /*
+ * Queue iff we aren't racing destruction
+ * and somebody else hasn't queued it already.
+ */
+ if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
+ }
spin_unlock(&wq_mayday_lock);
}
}
@@ -3329,7 +3330,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
*
* Undo alloc_workqueue_attrs().
*/
-static void free_workqueue_attrs(struct workqueue_attrs *attrs)
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
@@ -3345,7 +3346,7 @@ static void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-static struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs(void)
{
struct workqueue_attrs *attrs;
@@ -4030,16 +4031,20 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Performs GFP_KERNEL allocations.
*
+ * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus().
+ *
* Return: 0 on success and -errno on failure.
*/
-static int apply_workqueue_attrs(struct workqueue_struct *wq,
+int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
int ret;
- apply_wqattrs_lock();
+ lockdep_assert_cpus_held();
+
+ mutex_lock(&wq_pool_mutex);
ret = apply_workqueue_attrs_locked(wq, attrs);
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
return ret;
}
@@ -4152,16 +4157,21 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
mutex_unlock(&wq->mutex);
}
return 0;
- } else if (wq->flags & __WQ_ORDERED) {
+ }
+
+ get_online_cpus();
+ if (wq->flags & __WQ_ORDERED) {
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
- return ret;
} else {
- return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
+ put_online_cpus();
+
+ return ret;
}
static int wq_clamp_max_active(int max_active, unsigned int flags,
@@ -4305,6 +4315,22 @@ err_destroy:
}
EXPORT_SYMBOL_GPL(alloc_workqueue);
+static bool pwq_busy(struct pool_workqueue *pwq)
+{
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++)
+ if (pwq->nr_in_flight[i])
+ return true;
+
+ if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
+ return true;
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ return true;
+
+ return false;
+}
+
/**
* destroy_workqueue - safely terminate a workqueue
* @wq: target workqueue
@@ -4316,31 +4342,51 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct pool_workqueue *pwq;
int node;
+ /*
+ * Remove it from sysfs first so that sanity check failure doesn't
+ * lead to sysfs name conflicts.
+ */
+ workqueue_sysfs_unregister(wq);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
- /* sanity checks */
- mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq) {
- int i;
+ /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+ if (wq->rescuer) {
+ struct worker *rescuer = wq->rescuer;
- for (i = 0; i < WORK_NR_COLORS; i++) {
- if (WARN_ON(pwq->nr_in_flight[i])) {
- mutex_unlock(&wq->mutex);
- show_workqueue_state();
- return;
- }
- }
+ /* this prevents new queueing */
+ spin_lock_irq(&wq_mayday_lock);
+ wq->rescuer = NULL;
+ spin_unlock_irq(&wq_mayday_lock);
+
+ /* rescuer will empty maydays list before exiting */
+ kthread_stop(rescuer->task);
+ kfree(rescuer);
+ }
- if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
- WARN_ON(pwq->nr_active) ||
- WARN_ON(!list_empty(&pwq->delayed_works))) {
+ /*
+ * Sanity checks - grab all the locks so that we wait for all
+ * in-flight operations which may do put_pwq().
+ */
+ mutex_lock(&wq_pool_mutex);
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq) {
+ spin_lock_irq(&pwq->pool->lock);
+ if (WARN_ON(pwq_busy(pwq))) {
+ pr_warn("%s: %s has the following busy pwq\n",
+ __func__, wq->name);
+ show_pwq(pwq);
+ spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
+ mutex_unlock(&wq_pool_mutex);
show_workqueue_state();
return;
}
+ spin_unlock_irq(&pwq->pool->lock);
}
mutex_unlock(&wq->mutex);
+ mutex_unlock(&wq_pool_mutex);
/*
* wq list is used to freeze wq, remove from list after
@@ -4350,11 +4396,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- workqueue_sysfs_unregister(wq);
-
- if (wq->rescuer)
- kthread_stop(wq->rescuer->task);
-
if (!(wq->flags & WQ_UNBOUND)) {
wq_unregister_lockdep(wq);
/*
@@ -4629,7 +4670,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -4648,7 +4690,7 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_cont("%s %d%s:%ps", comma ? "," : "",
task_pid_nr(worker->task),
- worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work);
OpenPOWER on IntegriCloud