Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit_watch.c | 14
-rw-r--r--  kernel/bpf/hashtab.c | 30
-rw-r--r--  kernel/bpf/syscall.c | 4
-rw-r--r--  kernel/bpf/verifier.c | 21
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 3
-rw-r--r--  kernel/cgroup/cgroup.c | 66
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  kernel/cpu.c | 1
-rw-r--r--  kernel/events/core.c | 186
-rw-r--r--  kernel/events/internal.h | 6
-rw-r--r--  kernel/events/ring_buffer.c | 31
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/exit.c | 10
-rw-r--r--  kernel/fork.c | 11
-rw-r--r--  kernel/futex.c | 5
-rw-r--r--  kernel/irq/chip.c | 10
-rw-r--r--  kernel/irq/cpuhotplug.c | 9
-rw-r--r--  kernel/irq/ipi.c | 4
-rw-r--r--  kernel/kmod.c | 25
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/locking/qspinlock.c | 117
-rw-r--r--  kernel/membarrier.c | 70
-rw-r--r--  kernel/pid.c | 14
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/rcu/Kconfig | 3
-rw-r--r--  kernel/rcu/rcu.h | 128
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 108
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 28
-rw-r--r--  kernel/rcu/rcuperf.c | 17
-rw-r--r--  kernel/rcu/rcutorture.c | 83
-rw-r--r--  kernel/rcu/srcutiny.c | 8
-rw-r--r--  kernel/rcu/srcutree.c | 50
-rw-r--r--  kernel/rcu/tiny.c | 2
-rw-r--r--  kernel/rcu/tiny_plugin.h | 47
-rw-r--r--  kernel/rcu/tree.c | 213
-rw-r--r--  kernel/rcu/tree.h | 15
-rw-r--r--  kernel/rcu/tree_exp.h | 2
-rw-r--r--  kernel/rcu/tree_plugin.h | 238
-rw-r--r--  kernel/rcu/update.c | 18
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 11
-rw-r--r--  kernel/sched/core.c | 40
-rw-r--r--  kernel/sched/membarrier.c | 152
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/signal.c | 17
-rw-r--r--  kernel/task_work.c | 8
-rw-r--r--  kernel/time/timekeeping.c | 4
-rw-r--r--  kernel/time/timer.c | 52
-rw-r--r--  kernel/torture.c | 2
-rw-r--r--  kernel/trace/bpf_trace.c | 34
-rw-r--r--  kernel/trace/ftrace.c | 4
-rw-r--r--  kernel/trace/ring_buffer.c | 14
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 19
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events_filter.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 4
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/trace/tracing_map.c | 11
-rw-r--r--  kernel/watchdog.c | 1
-rw-r--r--  kernel/watchdog_hld.c | 59
-rw-r--r--  kernel/workqueue.c | 30
64 files changed, 1099 insertions, 992 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 62d686d96581..9eb8b3511636 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4fb463172aa8..d11c8181f4c5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ BITS_PER_LONG == 64;
+}
+
+static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
+{
+ u32 size = htab->map.value_size;
+
+ if (percpu || fd_htab_map_needs_adjust(htab))
+ size = round_up(size, 8);
+ return size;
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab->map.value_size;
+ u32 size = htab_size_value(htab, percpu);
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- /* round up value_size to 8 bytes */
- size = round_up(size, 8);
-
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
- struct bpf_map *map;
-
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
-
- /* pointer is stored internally */
- attr->value_size = sizeof(void *);
- map = htab_map_alloc(attr);
- attr->value_size = sizeof(u32);
-
- return map;
+ return htab_map_alloc(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
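
The new htab_size_value() above encodes a sizing rule: a HASH_OF_MAPS map declares a 4-byte u32 value to userspace but stores a kernel pointer internally, so on 64-bit the per-element value slot must be padded to 8 bytes rather than temporarily rewriting attr->value_size as fd_htab_map_alloc() used to do. A minimal userspace sketch of the rounding (illustration only, not part of the patch; round_up() here is a simplified stand-in for the kernel macro):

/* Illustration: how a 4-byte declared value becomes an 8-byte element slot. */
#include <stdio.h>
#include <stdint.h>

#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        uint32_t declared = sizeof(uint32_t);        /* attr->value_size seen by userspace */
        uint32_t stored   = round_up(declared, 8);   /* room for a void * on 64-bit */

        printf("declared %u bytes, element stores %u bytes\n", declared, stored);
        return 0;
}
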
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 045646da97cc..6c772adabad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info_len = min_t(u32, sizeof(info), info_len);
if (copy_from_user(&info, uinfo, info_len))
- return err;
+ return -EFAULT;
info.type = prog->type;
info.id = prog->aux->id;
@@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
ulen = info.xlated_prog_len;
- info.xlated_prog_len = bpf_prog_size(prog->len);
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
if (info.xlated_prog_len && ulen) {
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index af9e84a4944e..664d93972373 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* do our normal operations to the register, we need to set the values
* to the min/max since they are undefined.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (opcode != BPF_SUB) {
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ }
switch (opcode) {
case BPF_ADD:
@@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_SUB:
+ /* If one of our values was at the end of our ranges, then the
+ * _opposite_ value in the dst_reg goes to the end of our range.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
+ dst_reg->min_value -= max_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
+ dst_reg->max_value -= min_val;
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_MUL:
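
The BPF_SUB handling above is plain interval subtraction: for dst in [d_min, d_max] and src in [s_min, s_max], the difference lies in [d_min - s_max, d_max - s_min], which is why an unbounded bound on one side of src clamps the opposite bound of dst. A minimal userspace sketch of that arithmetic (illustration only, not part of the patch):

/* Illustration: interval subtraction as implemented by the verifier hunk above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t d_min = 0, d_max = 10;      /* known range of dst_reg */
        int64_t s_min = 2, s_max = 5;       /* known range of the subtracted value */

        /* dst - src: the opposite bound of src moves each dst bound */
        int64_t r_min = d_min - s_max;      /* worst case: subtract the largest src */
        int64_t r_max = d_max - s_min;      /* best case: subtract the smallest src */

        printf("[%lld, %lld] - [%lld, %lld] = [%lld, %lld]\n",
               (long long)d_min, (long long)d_max,
               (long long)s_min, (long long)s_max,
               (long long)r_min, (long long)r_max);
        return 0;
}
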
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 793565c05742..8b4c3c2f2509 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;
+ /* the number of tasks in the set */
+ int nr_tasks;
+
/* the subsys currently being processed */
int ssid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 620794a20a33..df2e0f14a95d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2006,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -2094,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2137,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2997,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -4669,6 +4673,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ca8376e5008c..87a1213dd326 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -63,6 +63,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -1891,6 +1892,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index eee033134262..bfbd649ccdc8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
+ rcutree_migrate_callbacks(cpu);
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..ce64f3fed5c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -5090,7 +5138,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5113,7 +5161,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5411,7 +5459,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -5972,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5987,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6105,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
static void __always_inline
@@ -7256,6 +7342,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7271,7 +7362,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7875,16 +7966,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7898,9 +7988,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -9580,6 +9676,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -9852,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -10001,28 +10104,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
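
Among the changes above, __perf_event_enable_time() rebases the tstamp_* fields so that the accumulated totals are preserved while the time spent in the OFF state is discarded: with tstamp_stopped = now, choosing tstamp_enabled = now - total_time_enabled keeps tstamp_stopped - tstamp_enabled equal to the old total. A small worked example with made-up nanosecond values (illustration only, not part of the patch):

/* Illustration: the timestamp rebasing done by __perf_event_enable_time(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t total_time_enabled = 700;   /* accumulated before the event went OFF */
        uint64_t total_time_running = 400;
        uint64_t now = 5000;                 /* perf_event_time() at re-enable */

        /* Rebase so the accumulated totals still fall out of the tstamp_* fields,
         * i.e. the OFF gap contributes nothing. */
        uint64_t tstamp_stopped = now;
        uint64_t tstamp_enabled = now - total_time_enabled;
        uint64_t tstamp_running = now - total_time_running;

        printf("enabled time preserved: %llu\n",
               (unsigned long long)(tstamp_stopped - tstamp_enabled));   /* 700 */
        printf("running time preserved: %llu\n",
               (unsigned long long)(tstamp_stopped - tstamp_running));   /* 400 */
        return 0;
}
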
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..843e97047335 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
+ long aux_head;
local_t aux_nest;
- local_t aux_wakeup;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
{
int rctx;
- if (in_nmi())
+ if (unlikely(in_nmi()))
rctx = 3;
else if (in_irq())
rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..af71a84e12ee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
*/
if (!rb->aux_overwrite) {
aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags);
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
wakeup = true;
- local_add(rb->aux_watermark, &rb->aux_wakeup);
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
}
if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
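
The aux_wakeup handling above changes from "advance by one watermark per wakeup" to "snap to the last watermark boundary crossed by aux_head", so a single large write that skips several watermarks no longer leaves aux_wakeup lagging behind. A small userspace sketch of the difference (illustration only, not part of the patch; rounddown() is a simplified stand-in for the kernel macro):

/* Illustration: why aux_wakeup is snapped with rounddown() instead of
 * being advanced by a single watermark per wakeup. */
#include <stdio.h>

#define rounddown(x, y) (((x) / (y)) * (y))

int main(void)
{
        unsigned long watermark = 64;
        unsigned long aux_head = 0, old_style = 0, new_style = 0;

        aux_head += 300;                              /* one write crosses several watermarks */

        old_style += watermark;                       /* old code: lags at 64 */
        new_style = rounddown(aux_head, watermark);   /* new code: 256 */

        printf("aux_head=%lu old aux_wakeup=%lu new aux_wakeup=%lu\n",
               aux_head, old_style, new_style);
        return 0;
}
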
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0e137f98a50c..267f6ef91d97 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..f9ef3ecc78c1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
* Ensure that we must observe the pi_state in exit_mm() ->
* mm_release() -> exit_pi_state_list().
*/
- raw_spin_unlock_wait(&tsk->pi_lock);
+ raw_spin_lock_irq(&tsk->pi_lock);
+ raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
@@ -918,7 +916,7 @@ void __noreturn do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
do_task_dead();
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..b7e9e57b71ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -806,11 +813,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
diff --git a/kernel/futex.c b/kernel/futex.c
index 16dbe4c93895..f50b434756c1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -670,13 +670,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3cc37c0c85e..3675c6004f2a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
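
The irq_modify_status() change above preserves the previously programmed trigger type whenever the new settings carry IRQ_TYPE_NONE, instead of unconditionally overwriting it. A minimal sketch of that selection logic (illustration only, not part of the patch; the IRQ_TYPE_* values mirror the kernel's definitions):

/* Illustration: keep the old trigger when the new mask says "none". */
#include <stdio.h>

#define IRQ_TYPE_NONE        0x0
#define IRQ_TYPE_LEVEL_HIGH  0x4

int main(void)
{
        unsigned long trigger = IRQ_TYPE_LEVEL_HIGH;  /* previously programmed type */
        unsigned long tmp = IRQ_TYPE_NONE;            /* what the new settings carry */

        if (tmp != IRQ_TYPE_NONE)
                trigger = tmp;

        printf("trigger stays 0x%lx instead of being wiped to NONE\n", trigger);
        return 0;
}
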
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index aee8f7ec40af..638eb9c83d9f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = cpu_online_mask;
brokeaff = true;
}
-
- err = irq_do_set_affinity(d, affinity, true);
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1c8ea0..259a22aa9934 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6d016c5d97c8..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
atomic_read(&kmod_concurrent_max),
MAX_KMOD_CONCURRENT, module_name);
- wait_event_interruptible(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+ ret = wait_event_killable_timeout(kmod_wq,
+ atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+ MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (!ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return -ETIME;
+ } else if (ret == -ERESTARTSYS) {
+ pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+ return ret;
+ }
}
trace_module_request(module_name, wait, _RET_IP_);
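
The kmod hunk above distinguishes three outcomes of the bounded wait. handle_wait_result() below is a hypothetical helper, not kernel code, that models the return convention of wait_event_killable_timeout(): 0 means the timeout expired with no free slot, -ERESTARTSYS means a fatal signal arrived, and a positive value means a modprobe slot freed up before the timeout:

/* Illustration: the three outcomes the kmod hunk above distinguishes. */
#include <stdio.h>
#include <errno.h>

#ifndef ERESTARTSYS
#define ERESTARTSYS 512         /* kernel-internal errno, not exported to userspace */
#endif

static int handle_wait_result(long ret)
{
        if (!ret) {
                printf("timed out: every modprobe slot stayed busy\n");
                return -ETIME;
        }
        if (ret == -ERESTARTSYS) {
                printf("fatal signal received: stop waiting for a slot\n");
                return (int)ret;
        }
        printf("slot acquired with %ld ticks of the timeout left\n", ret);
        return 0;
}

int main(void)
{
        handle_wait_result(0);              /* timeout      -> -ETIME   */
        handle_wait_result(-ERESTARTSYS);   /* fatal signal -> propagate */
        handle_wait_result(3);              /* success      -> 0        */
        return 0;
}
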
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528c1d88..1c19edf82427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -637,6 +637,7 @@ repeat:
schedule();
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- * barrier() smp_mb() sys_membarrier()
- * barrier() X X O
- * smp_mb() X O O
- * sys_membarrier() O O O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
- if (tick_nohz_full_enabled())
- return -ENOSYS;
- if (unlikely(flags))
- return -EINVAL;
- switch (cmd) {
- case MEMBARRIER_CMD_QUERY:
- return MEMBARRIER_CMD_BITMASK;
- case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
- synchronize_sched();
- return 0;
- default:
- return -EINVAL;
- }
-}
diff --git a/kernel/pid.c b/kernel/pid.c
index 731c4e528f4e..020dedbdf066 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
@@ -575,13 +572,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
}
void __init pidmap_init(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 222317721c5a..0972a8e09d08 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- bool
- default n
+ def_bool PREEMPT
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = 0;
*completed = 0;
}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
#endif
#ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
- int cnt = 0;
- struct rcu_head **rhpp = &rclp->head;
-
- for (;;) {
- if (!*rhpp)
- return cnt;
- if (++cnt > lim)
- return -1;
- rhpp = &(*rhpp)->next;
- }
-}
-
-/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
- if (seg == RCU_DONE_TAIL)
- return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
- return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
}
/*
- * Dequeue and return the first ready-to-invoke callback. If there
- * are no ready-to-invoke callbacks, return NULL. Disables interrupts
- * to avoid interference. Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
- int i;
- struct rcu_head *rhp;
-
- local_irq_save(flags);
- if (!rcu_segcblist_ready_cbs(rsclp)) {
- local_irq_restore(flags);
- return NULL;
- }
- rhp = rsclp->head;
- BUG_ON(!rhp);
- rsclp->head = rhp->next;
- for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
- if (rsclp->tails[i] != &rhp->next)
- break;
- rsclp->tails[i] = &rsclp->head;
- }
- smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
- WRITE_ONCE(rsclp->len, rsclp->len - 1);
- local_irq_restore(flags);
- return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rsclp->len_lazy--;
- local_irq_restore(flags);
-}
-
-/*
* Return a pointer to the first callback in the specified rcu_segcblist
* structure. This is useful for diagnostics.
*/
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_is_enabled(rsclp) &&
- !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
return true;
return false;
}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+ rcu_segcblist_extract_count(src_rsclp, &donecbs);
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+ rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
rclp->len_lazy--;
}
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
- return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
- WARN_ON_ONCE(!rclp->head);
- return rclp->tail;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks perf testing.
*/
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
.name = "tasks"
};
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* If performance tests complete, wait for shutdown to commence.
*/
@@ -658,7 +643,7 @@ rcu_perf_init(void)
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
- RCUPERF_TASKS_OPS
+ &tasks_ops,
};
if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
- .name = "rcu_busted"
+ .name = "busted"
};
/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task())
schedule_timeout_interruptible(longdelay);
else
rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int __maybe_unused cpu;
- int idx;
-
-#ifdef CONFIG_TREE_SRCU
- idx = srcu_ctlp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
- u0 = counts->srcu_unlock_count[!idx];
- u1 = counts->srcu_unlock_count[idx];
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = counts->srcu_lock_count[!idx];
- l1 = counts->srcu_lock_count[idx];
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
- idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
- torture_type, TORTURE_FLAG, idx,
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcu"
};
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcud"
};
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks torture testing.
*/
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags,
- wtp == NULL ? ~0UL : wtp->state);
+ wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &sched_ops, RCUTORTURE_TASKS_OPS
+ &sched_ops, &tasks_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+int rcu_scheduler_active __read_mostly;
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
destroy_rcu_head_on_stack(&rs.head);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
/*
* Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
}
/**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
srcu_advance_state(sp);
srcu_reschedule(sp, srcu_get_delay(sp));
}
-EXPORT_SYMBOL_GPL(process_srcu);
void srcutorture_get_gp_data(enum rcutorture_type test_type,
struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+
+ idx = sp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *counts;
+
+ counts = per_cpu_ptr(sp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
static int __init srcu_bootup_announce(void)
{
pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
-#include "tiny_plugin.h"
-
void rcu_barrier_bh(void)
{
wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously. Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process. Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
- ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3acf32d..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
.gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
- .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
rcu_eqs_enter(false);
- local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_user_enter(void)
{
- rcu_eqs_enter(1);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
+ __this_cpu_inc(disable_rcu_irq_enter);
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_eqs_exit_common(oldval, user);
+ __this_cpu_dec(disable_rcu_irq_enter);
}
}
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+ rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_interruptible(rsp->gp_wq,
- READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
-{
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
- rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- */
- rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
- /* Finally, disallow further callbacks on this CPU. */
- rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs are handled specially. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->n_cbs_adopted += rsp->orphan_done.len;
- if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
- rcu_idle_count_callbacks_posted();
- rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks, then the done ones. */
- rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
- WARN_ON_ONCE(rsp->orphan_done.head);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
- WARN_ON_ONCE(rsp->orphan_pend.head);
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
- !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
* Trace the fact that this CPU is going offline.
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
- !rcu_segcblist_empty(&rdp->cblist),
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
- cpu, rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+ rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
}
}
@@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rsp->barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+ rsp->barrier_sequence);
}
}
@@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, s);
+ _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
/* Did someone else do our work for us? */
if (rcu_seq_done(&rsp->barrier_sequence, s)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+ rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
@@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Mark the start of the barrier operation. */
rcu_seq_start(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
rsp->barrier_sequence);
} else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
rcu_barrier_callback, rsp, cpu, 0);
}
} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
rsp->barrier_sequence);
}
}
@@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
wait_for_completion(&rsp->barrier_completion);
/* Mark the end of the barrier operation. */
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
rcu_seq_end(&rsp->barrier_sequence);
/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- if (!rdp->beenonline)
- WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
@@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
+ int nbits;
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
@@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->qsmaskinitnext |= mask;
+ oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
+ oldmask ^= rnp->expmaskinitnext;
+ nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ return; /* No callbacks to migrate. */
+
+ local_irq_save(flags);
+ my_rdp = this_cpu_ptr(rsp->rda);
+ if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+ local_irq_restore(flags);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation. We need to migrate the outgoing CPU's callbacks.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_migrate_callbacks(cpu, rsp);
+}
#endif
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
struct rcu_head **nocb_follower_tail;
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_cblist orphan_done; /* Orphaned callbacks that */
- /* are ready to invoke. */
- /* (Contains counts.) */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
+ int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
struct rcu_node *rnp;
struct rcu_node *rnp_up;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
}
/*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ struct task_struct *t;
+
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gpnum, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * and this function releases it.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!READ_ONCE(rdp_leader->nocb_kthread))
+ lockdep_assert_held(&rdp->nocb_lock);
+ if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
- if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ }
+ if (rdp_leader->nocb_leader_sleep || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ del_timer(&rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
+ } else {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
}
/*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ __wake_nocb_leader(rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->orphan_done.len;
- long qll = rsp->orphan_done.len_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_done.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
- rcu_cblist_tail(&rsp->orphan_done),
- ql, qll, flags);
- }
- if (rsp->orphan_pend.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
- rcu_cblist_tail(&rsp->orphan_pend),
- ql, qll, flags);
- }
- rcu_cblist_init(&rsp->orphan_done);
- rcu_cblist_init(&rsp->orphan_pend);
+ return false; /* Not NOCBs CPU, caller must migrate CBs. */
+ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+ rcu_segcblist_tail(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
+ unsigned long flags;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+ my_rdp->nocb_leader_sleep = true;
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
}
/*
@@ -2054,7 +2095,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
- smp_mb(); /* wakeup before ->nocb_head reads. */
+ smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
gotcbs = true;
}
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
+ /* No callbacks? Sleep a bit if polling, and go retry. */
if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (READ_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
+ if (rcu_nocb_poll) {
+ schedule_timeout_interruptible(1);
+ } else {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ }
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (READ_ONCE(rdp->nocb_head))
+ if (!rcu_nocb_poll &&
+ READ_ONCE(rdp->nocb_head) &&
+ READ_ONCE(my_rdp->nocb_leader_sleep)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+ }
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = rdp->nocb_gp_tail;
*tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
+ /* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
}
}
@@ -2131,28 +2157,16 @@ wait_again:
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
- bool firsttime = true;
-
for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- swait_event_interruptible(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+ swait_event_interruptible(rdp->nocb_wq,
+ READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
}
}
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
+ unsigned long flags;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = READ_ONCE(rdp->nocb_follower_head);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ list = rdp->nocb_follower_head;
+ rdp->nocb_follower_head = NULL;
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- WRITE_ONCE(rdp->nocb_follower_head, NULL);
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
+ unsigned long flags;
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
+ }
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+ __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(unsigned long x)
+{
+ do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_tail = &rdp->nocb_head;
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_lock_init(&rdp->nocb_lock);
+ setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
+ (unsigned long)rdp);
}
/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
mutex_unlock(&rcu_tasks_kthread_mutex);
}
+/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_start(void)
+{
+ preempt_disable();
+ current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ preempt_enable();
+}
+
+/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_finish(void)
+{
+ preempt_disable();
+ __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+ preempt_enable();
+}
+
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifndef CONFIG_TINY_RCU
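The two hooks above are meant to be called from the task-exit path so that the synchronize_srcu(&tasks_rcu_exit_srcu) in the RCU-tasks kthread can wait out tasks that are in the middle of exiting. A rough sketch of the intended bracketing follows; the exact placement inside do_exit() is an assumption here, not shown by this hunk:

	/* Hypothetical outline of the exit path; details elided. */
	void __noreturn do_exit_outline(long code)
	{
		exit_tasks_rcu_start();		/* srcu_read_lock(&tasks_rcu_exit_srcu) */
		/* ... make the task invisible to RCU-tasks readers ... */
		exit_tasks_rcu_finish();	/* matching srcu_read_unlock() */
		/* ... final context switch away, never returns ... */
		for (;;)
			cpu_relax();
	}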
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..c9524d2d9316 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -307,14 +309,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..e053c31d96da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -2069,7 +2074,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: runqueue flags used for lock pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC ends
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out whether smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h" /* for cpu_rq(). */
+
+/*
+ * Bitmask made from an OR of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even though we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * are guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_SHARED:
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ membarrier_private_expedited();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
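To make the new command concrete, a minimal user-space sketch follows; it is illustrative only, not part of this patch, and assumes the uapi <linux/membarrier.h> definitions plus a libc that does not yet wrap the syscall:

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		int supported = membarrier(MEMBARRIER_CMD_QUERY, 0);

		if (supported < 0 || !(supported & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
			fprintf(stderr, "private expedited membarrier unsupported\n");
			return 1;
		}
		/* Orders this thread against all other running threads of the process. */
		return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0) ? 1 : 0;
	}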
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6b0a9f..d6afed6d0752 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ int ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index caed9133ae52..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
- if (action->sa.sa_handler == SIG_DFL)
+ /*
+ * Don't clear SIGNAL_UNKILLABLE for traced tasks; users won't expect
+ * debugging to leave init killable.
+ */
+ if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -3303,12 +3307,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
+#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(old_sigset_t));
- if (err == 0)
- if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
+ int err = do_sigpending(&set, sizeof(set.sig[0]));
+ if (!err)
+ err = put_user(set.sig[0], set32);
return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
+#endif
}
#endif
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ raw_spin_lock_irq(&task->pi_lock);
do {
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
+ raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
- /*
- * Synchronize with task_work_cancel(). It can't remove
- * the first entry == work, cmpxchg(task_works) should
- * fail, but it can play with *work and other entries.
- */
- raw_spin_unlock_wait(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa008de5..7e7e61c00d61 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
- seconds = tk->raw_sec;
- nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
- tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 71ce3f4eead3..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also, the tick is stopped, so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base; deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
raw_spin_unlock(&base->lock);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+ * base. Idle tracking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
(*n_offl_successes)++;
delta = jiffies - starttime;
- sum_offl += delta;
+ *sum_offl += delta;
if (*min_offl < 0) {
*min_offl = delta;
*max_offl = delta;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 37385193a608..dc498b605d5d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
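The nested __BPF_ARGn_TP() macros above pick one of three casts per
argument (64-bit, long or u32) so that the single variadic call receives
correctly sized values on both 32-bit and 64-bit architectures. A rough
user-space analogue of the same dispatch, reduced to two arguments and
using printf() in place of __trace_printk() (the TP/ARG macro names and
the mod[] values are made up for the demo):

#include <limits.h>
#include <stdio.h>

/*
 * mod[i] mirrors the encoding used by bpf_trace_printk(): the number of
 * 'l' modifiers seen for argument i (0 = int, 1 = long, 2 = long long).
 * The values are hard-coded to match the demo format string below.
 */
static int mod[2] = { 2, 0 };

#define BITS_PER_LONG	((int)(sizeof(long) * CHAR_BIT))

#define TP(...)		printf(fmt, ##__VA_ARGS__)

#define ARG1(...)							\
	((mod[0] == 2 || (mod[0] == 1 && BITS_PER_LONG == 64))		\
	 ? TP(arg1, ##__VA_ARGS__)					\
	 : ((mod[0] == 1 || (mod[0] == 0 && BITS_PER_LONG == 32))	\
	    ? TP((long)arg1, ##__VA_ARGS__)				\
	    : TP((unsigned int)arg1, ##__VA_ARGS__)))

#define ARG2(...)							\
	((mod[1] == 2 || (mod[1] == 1 && BITS_PER_LONG == 64))		\
	 ? ARG1(arg2, ##__VA_ARGS__)					\
	 : ((mod[1] == 1 || (mod[1] == 0 && BITS_PER_LONG == 32))	\
	    ? ARG1((long)arg2, ##__VA_ARGS__)				\
	    : ARG1((unsigned int)arg2, ##__VA_ARGS__)))

int main(void)
{
	const char *fmt = "a=%llu b=%u\n";
	unsigned long long arg1 = 1ULL << 40;
	unsigned long long arg2 = 42;

	/*
	 * Expansion runs outside-in: ARG2 picks arg2's cast, ARG1 picks
	 * arg1's cast, and TP() finally emits the single printf() call
	 * with both arguments in place, just like __BPF_TP_EMIT().
	 */
	ARG2();
	return 0;
}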
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02004ae91860..96cea88fa00f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 529cc50d7243..81279c6602ff 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
- * The page allocated, or NULL on error.
+ * The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-ENODEV);
+
+ cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
- * if (!rpage)
- * return error;
+ * if (IS_ERR(rpage))
+ * return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
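After this change a caller has to distinguish an ERR_PTR()-encoded error
from a valid page rather than testing for NULL. A stand-alone sketch of
that calling convention, with minimal user-space re-implementations of
ERR_PTR()/IS_ERR()/PTR_ERR() and a toy allocator standing in for
ring_buffer_alloc_read_page():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal user-space stand-ins for the kernel's <linux/err.h> helpers. */
#define MAX_ERRNO	4095
static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/*
 * Toy allocator with the same contract as the patched function: either a
 * valid pointer or an ERR_PTR() carrying the reason, never NULL.
 */
static void *alloc_read_page(int cpu_is_online)
{
	void *page;

	if (!cpu_is_online)
		return ERR_PTR(-ENODEV);	/* previously: unchecked access */
	page = malloc(4096);
	if (!page)
		return ERR_PTR(-ENOMEM);	/* previously: NULL */
	return page;
}

int main(void)
{
	void *page = alloc_read_page(0);

	if (IS_ERR(page)) {			/* not: if (!page) */
		printf("alloc failed: %ld\n", PTR_ERR(page));
		return 1;
	}
	free(page);
	return 0;
}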
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf567886..68ee79afe31c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
- if (!bpage)
+ if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 42b9355033d4..44004d8aa3b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
- ssize_t ret;
+ ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- info->spare_cpu = iter->cpu_file;
+ if (IS_ERR(info->spare)) {
+ ret = PTR_ERR(info->spare);
+ info->spare = NULL;
+ } else {
+ info->spare_cpu = iter->cpu_file;
+ }
}
if (!info->spare)
- return -ENOMEM;
+ return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
- if (!ref->page) {
- ret = -ENOMEM;
+ if (IS_ERR(ref->page)) {
+ ret = PTR_ERR(ref->page);
+ ref->page = NULL;
kfree(ref);
break;
}
@@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
+ ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
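The added "ret = -ENOMEM;" in tracer_alloc_buffers() follows the usual
goto-unwind idiom: whoever jumps to an error label must already have put
the right error code in ret. A minimal sketch of the bug that line fixes,
with hypothetical helper names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical earlier step whose non-negative result ends up in ret. */
static int earlier_setup_step(void)
{
	return 0;
}

static int setup_buffers(void)
{
	char *mask, *temp;
	int ret;

	mask = malloc(64);
	if (!mask)
		return -ENOMEM;

	ret = earlier_setup_step();
	if (ret < 0)
		goto out_free_mask;

	/*
	 * Same idea as the added "ret = -ENOMEM;": without it, a failed
	 * allocation below would reach the error label with ret still 0
	 * and the function would report success.
	 */
	ret = -ENOMEM;
	temp = malloc(4096);
	if (!temp)
		goto out_free_mask;

	free(temp);
	free(mask);
	return 0;

out_free_mask:
	free(mask);
	return ret;
}

int main(void)
{
	printf("setup_buffers() = %d\n", setup_buffers());
	return 0;
}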
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..13ba2d3f6a91 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,6 +306,7 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
+ struct perf_event *event;
struct ftrace_entry *entry;
struct hlist_head *head;
struct pt_regs regs;
@@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
+ event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, head, NULL, event);
#undef ENTRY_SIZE
}
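The event pointer passed along to perf_trace_buf_submit() is recovered
with container_of(), which maps a pointer to an embedded member back to
its enclosing structure. A self-contained illustration of that pattern
with made-up struct names (not the perf/ftrace types):

#include <stddef.h>
#include <stdio.h>

/* Same arithmetic the kernel macro does, minus the typeof type checking. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_ops {
	void (*func)(struct fake_ops *ops);
};

struct fake_event {
	int id;
	struct fake_ops ftrace_ops;	/* embedded member */
};

static void callback(struct fake_ops *ops)
{
	/*
	 * Recover the enclosing event from the embedded ops pointer, the
	 * way perf_ftrace_function_call() now recovers its perf_event.
	 */
	struct fake_event *event = container_of(ops, struct fake_event, ftrace_ops);

	printf("callback for event %d\n", event->id);
}

int main(void)
{
	struct fake_event ev = { .id = 7, .ftrace_ops = { .func = callback } };

	ev.ftrace_ops.func(&ev.ftrace_ops);
	return 0;
}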
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411ff60c7..181e139a8057 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
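The create_filter() fix makes the error contract uniform: when the caller
did not ask for the error string to be kept, a failed parse frees the
partially built filter and the out-pointer becomes NULL instead of a
bogus object. A toy user-space version of that contract (names and the
fake parse check are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_filter { char *expr; };

static int toy_create_filter(const char *str, int keep_err_str,
			     struct toy_filter **filterp)
{
	struct toy_filter *filter;
	int err = 0;

	*filterp = NULL;
	filter = calloc(1, sizeof(*filter));
	if (!filter)
		return -ENOMEM;

	if (strchr(str, '(') && !strchr(str, ')'))	/* fake parse error */
		err = -EINVAL;
	else
		filter->expr = strdup(str);

	if (err && !keep_err_str) {
		free(filter->expr);
		free(filter);
		filter = NULL;	/* never hand back a half-built filter */
	}
	*filterp = filter;
	return err;
}

int main(void)
{
	struct toy_filter *f;
	int err = toy_create_filter("(a == 1", 0, &f);

	printf("err=%d filter=%p\n", err, (void *)f);	/* filter is NULL */
	return 0;
}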
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c9b5aa10fbf9..8a907e12b6b9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395da88e..74d9a86eccc0 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
(unsigned long *)&rec->args);
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL);
+ 1, regs, head, NULL, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fec9681..4525e0271a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
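tracing_map_array_free() tears down a two-level allocation (the struct,
its pages[] table and the pages themselves), and the added kfree(a->pages)
plugs the leak of the table. A sketch of the symmetric alloc/free pairing
with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct toy_array {
	unsigned int n_pages;
	void **pages;		/* table of page pointers */
};

static void toy_array_free(struct toy_array *a)
{
	unsigned int i;

	if (!a)
		return;

	if (!a->pages)
		goto free;	/* struct allocated, table was not */

	for (i = 0; i < a->n_pages; i++) {
		if (!a->pages[i])
			break;	/* allocation stopped here */
		free(a->pages[i]);
	}

	free(a->pages);		/* what the fix adds: free the table too */
 free:
	free(a);
}

static struct toy_array *toy_array_alloc(unsigned int n_pages)
{
	struct toy_array *a = calloc(1, sizeof(*a));
	unsigned int i;

	if (!a)
		return NULL;
	a->n_pages = n_pages;
	a->pages = calloc(n_pages, sizeof(*a->pages));
	if (!a->pages)
		goto fail;
	for (i = 0; i < n_pages; i++) {
		a->pages[i] = calloc(1, 4096);
		if (!a->pages[i])
			goto fail;
	}
	return a;
 fail:
	toy_array_free(a);	/* handles every partially built state */
	return NULL;
}

int main(void)
{
	struct toy_array *a = toy_array_alloc(4);

	toy_array_free(a);
	printf("alloc/free round trip done\n");
	return 0;
}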
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06d3389bca0d..f5d52024f6b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -240,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 295a0d84934c..3a09ea1b1d3d 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+ /*
+ * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+ *
+ * So it runs effectively with 2.5 times the rate of the NMI
+ * watchdog. That means the hrtimer should fire 2-3 times before
+ * the NMI watchdog expires. The NMI watchdog on x86 is based on
+ * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+ * might run way faster than expected and the NMI fires in a
+ * smaller period than the one deduced from the nominal CPU
+ * frequency. Depending on the Turbo-Mode factor this might be fast
+ * enough to get the NMI period smaller than the hrtimer watchdog
+ * period and trigger false positives.
+ *
+ * The sample threshold is used to check in the NMI handler whether
+ * the minimum time between two NMI samples has elapsed. That
+ * prevents false positives.
+ *
+ * Set this to 4/5 of the actual watchdog threshold period so the
+ * hrtimer is guaranteed to fire at least once within the real
+ * watchdog threshold.
+ */
+ watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+ ktime_t delta, now = ktime_get_mono_fast_ns();
+
+ delta = now - __this_cpu_read(last_timestamp);
+ if (delta < watchdog_hrtimer_sample_threshold) {
+ /*
+ * If ktime is jiffies based, a stalled timer would prevent
+ * jiffies from being incremented and the filter would look
+ * at a stale timestamp and never trigger.
+ */
+ if (__this_cpu_inc_return(nmi_rearmed) < 10)
+ return false;
+ }
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, now);
+ return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+ return true;
+}
+#endif
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
+ if (!watchdog_check_timestamp())
+ return;
+
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
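A user-space approximation of the timestamp filter above, using
CLOCK_MONOTONIC in place of ktime_get_mono_fast_ns(); the names and the
example period are illustrative, not the kernel code:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t last_timestamp_ns;
static unsigned int nmi_rearmed;
static uint64_t sample_threshold_ns;

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void update_threshold(uint64_t hrtimer_period_ns)
{
	/*
	 * The hrtimer period is 2/5 of the watchdog threshold, so twice
	 * the period is the 4/5 figure from the comment above.
	 */
	sample_threshold_ns = hrtimer_period_ns * 2;
}

/*
 * Reject an "NMI" sample unless the threshold has elapsed since the last
 * accepted one, but give up on the filter after 10 consecutive rejections
 * so a stalled clocksource cannot suppress real lockup reports.
 */
static int check_timestamp(void)
{
	uint64_t now = mono_ns();

	if (now - last_timestamp_ns < sample_threshold_ns) {
		if (++nmi_rearmed < 10)
			return 0;	/* too early, e.g. a turbo effect */
	}
	nmi_rearmed = 0;
	last_timestamp_ns = now;
	return 1;
}

int main(void)
{
	update_threshold(400000000ull);		/* 0.4 s hrtimer period */
	printf("first sample accepted: %d\n", check_timestamp());
	printf("immediate resample accepted: %d\n", check_timestamp());
	return 0;
}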
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688fabc55..ca937b0c3a96 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
@@ -3744,8 +3751,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3940,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4140,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5253,7 +5275,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
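The workqueue changes split "ordered" into an implicit flavour (inferred
from WQ_UNBOUND plus max_active == 1 and silently dropped when attributes
are changed) and an explicit one that still rejects such changes. A
compact model of that flag logic, with bit values chosen purely for
illustration rather than taken from linux/workqueue.h:

#include <stdbool.h>
#include <stdio.h>

#define WQ_UNBOUND		(1u << 1)
#define __WQ_ORDERED		(1u << 17)
#define __WQ_ORDERED_EXPLICIT	(1u << 18)

/* Mirrors the inference added to __alloc_workqueue_key(). */
static unsigned int alloc_flags(unsigned int flags, int max_active)
{
	if ((flags & WQ_UNBOUND) && max_active == 1)
		flags |= __WQ_ORDERED;
	return flags;
}

/*
 * Mirrors the relaxed checks: only explicitly ordered workqueues refuse
 * attribute/max_active changes; implicitly ordered ones simply lose the
 * ordered behaviour.
 */
static bool may_change_attrs(unsigned int *flags)
{
	if (*flags & __WQ_ORDERED_EXPLICIT)
		return false;
	*flags &= ~__WQ_ORDERED;
	return true;
}

int main(void)
{
	unsigned int implicit = alloc_flags(WQ_UNBOUND, 1);
	unsigned int explicit_wq = alloc_flags(WQ_UNBOUND | __WQ_ORDERED |
					       __WQ_ORDERED_EXPLICIT, 1);

	printf("implicit ordered may change: %d\n", may_change_attrs(&implicit));
	printf("explicit ordered may change: %d\n", may_change_attrs(&explicit_wq));
	return 0;
}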