Diffstat (limited to 'kernel')
92 files changed, 6823 insertions(+), 4710 deletions(-)
diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..265a0d854e33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD kern_version +#define BPF_PROG_LOAD_LAST_FIELD prog_flags static int bpf_prog_load(union bpf_attr *attr) { @@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + return -EINVAL; + /* copy eBPF program license from user space */ if 
(strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5b56c92f8e2..a8a725697bed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_INSNS 98304 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) @@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); + if (reg->min_align) + verbose(",min_align=%u", reg->min_align); + if (reg->aux_off) + verbose(",aux_off=%u", reg->aux_off); + if (reg->aux_off_align) + verbose(",aux_off_align=%u", reg->aux_off_align); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -457,16 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(&regs[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -492,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].min_align = 0; } static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, @@ -779,17 +792,37 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, - int off, int size) + int off, int size, bool strict) { - if (reg->id && size != 1) { - verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); - return -EACCES; + int ip_align; + int reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = reg->off; + if (reg->id) { + if (reg->aux_off_align % size) { + verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", + reg->aux_off_align, size); + return -EACCES; + } + reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'.
+ */ + ip_align = 2; + if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg->off, off, size); + ip_align, reg_off, off, size); return -EACCES; } @@ -797,9 +830,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, } static int check_val_ptr_alignment(const struct bpf_reg_state *reg, - int size) + int size, bool strict) { - if (size != 1) { + if (strict && size != 1) { verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); return -EACCES; } @@ -807,16 +840,17 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_ptr_alignment(const struct bpf_reg_state *reg, +static int check_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size) { + bool strict = env->strict_alignment; + switch (reg->type) { case PTR_TO_PACKET: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_pkt_ptr_alignment(reg, off, size); + return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE_ADJ: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_val_ptr_alignment(reg, size); + return check_val_ptr_alignment(reg, size, strict); default: if (off % size != 0) { verbose("misaligned access off %d size %d\n", @@ -849,7 +883,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (size < 0) return size; - err = check_ptr_alignment(reg, off, size); + err = check_ptr_alignment(env, reg, off, size); if (err) return err; @@ -883,6 +917,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; + state->regs[value_regno].aux_off = 0; + state->regs[value_regno].aux_off_align = 0; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -953,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1313,7 +1354,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1380,11 +1420,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -1455,6 +1492,8 @@ add_imm: */ dst_reg->off += imm; } else { + bool had_id; + if (src_reg->type == PTR_TO_PACKET) { /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ tmp_reg = *dst_reg; /* save r7 state */ @@ -1488,14 +1527,23 @@ add_imm: src_reg->imm); return -EACCES; } + + had_id = (dst_reg->id != 0); + /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ dst_reg->id = ++env->id_gen; - /* something was added to 
pkt_ptr, set range and off to zero */ + /* something was added to pkt_ptr, set range to zero */ + dst_reg->aux_off += dst_reg->off; dst_reg->off = 0; dst_reg->range = 0; + if (had_id) + dst_reg->aux_off_align = min(dst_reg->aux_off_align, + src_reg->min_align); + else + dst_reg->aux_off_align = src_reg->min_align; } return 0; } @@ -1669,6 +1717,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg) reg->min_value = BPF_REGISTER_MIN_RANGE; } +static u32 calc_align(u32 imm) +{ + if (!imm) + return 1U << 31; + return imm - ((imm - 1) & imm); +} + static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1676,8 +1731,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); + u32 dst_align, src_align; dst_reg = &regs[insn->dst_reg]; + src_align = 0; if (BPF_SRC(insn->code) == BPF_X) { check_reg_overflow(&regs[insn->src_reg]); min_val = regs[insn->src_reg].min_value; @@ -1693,12 +1750,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, regs[insn->src_reg].type != UNKNOWN_VALUE) { min_val = BPF_REGISTER_MIN_RANGE; max_val = BPF_REGISTER_MAX_RANGE; + src_align = 0; + } else { + src_align = regs[insn->src_reg].min_align; } } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; + src_align = calc_align(insn->imm); } + dst_align = dst_reg->min_align; + /* We don't know anything about what was done to this register, mark it * as unknown. */ @@ -1723,18 +1786,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_value += min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value *= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_AND: /* Disallow AND'ing of negative numbers, ain't nobody got time @@ -1746,17 +1812,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, else dst_reg->min_value = 0; dst_reg->max_value = max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_LSH: /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - + dst_reg->min_align = 1; + } else { + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value <<= min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + dst_reg->min_align <<= min_val; + } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) @@ -1766,11 +1838,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* RSH by a negative number is undefined, and the BPF_RSH is an * unsigned shift, so make the appropriate casts.
*/ - if (min_val < 0 || dst_reg->min_value < 0) + if (min_val < 0 || dst_reg->min_value < 0) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; + } + if (min_val < 0) { + dst_reg->min_align = 1; + } else { + dst_reg->min_align >>= (u64) min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + } if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value >>= max_val; break; @@ -1872,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].imm = insn->imm; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; + regs[insn->dst_reg].min_align = calc_align(insn->imm); } } else if (opcode > BPF_END) { @@ -2368,7 +2449,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2401,11 +2481,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet @@ -2564,6 +2641,7 @@ peek_stack: env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ + env->explored_states[t] = STATE_LIST_MARK; ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -2615,7 +2693,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2658,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2722,8 +2801,14 @@ static bool states_equal(struct bpf_verifier_env *env, rcur->type != NOT_INIT)) continue; + /* Don't care about the reg->id in this case. 
*/ + if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && + rcur->type == PTR_TO_MAP_VALUE_OR_NULL && + rold->map_ptr == rcur->map_ptr) + continue; + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -2856,8 +2941,15 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } - if (log_level && do_print_state) { - verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + if (need_resched()) + cond_resched(); + + if (log_level > 1 || (log_level && do_print_state)) { + if (log_level > 1) + verbose("%d:", insn_idx); + else + verbose("\nfrom %d to %d:", + prev_insn_idx, insn_idx); print_verifier_state(&env->cur_state); do_print_state = false; } @@ -3495,6 +3587,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -3600,6 +3696,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; + env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_KERNEL); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c3c9a0e1b3c9..8d4e85eae42c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4265,6 +4265,11 @@ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); + if (css->flags & CSS_DYING) + return; + + css->flags |= CSS_DYING; + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f6501f4f6040..ae643412948a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -176,9 +176,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..cb5103413bd8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1658,13 +1658,13 @@ static ssize_t write_cpuhp_target(struct device *dev, ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) - return ret; + goto out; if (st->state < target) ret = do_cpu_up(dev->id, target); else ret = do_cpu_down(dev->id, target); - +out: unlock_device_hotplug(); return ret ? 
ret : count; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c04917cad1bf..1b2be63c8528 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { + mm_segment_t fs; + if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + + fs = get_fs(); + set_fs(USER_DS); perf_callchain_user(&ctx, regs); + set_fs(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..bc63f8db1b0d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -925,11 +925,6 @@ static inline int is_cgroup_event(struct perf_event *event) return 0; } -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - static inline void update_cgrp_time_from_event(struct perf_event *event) { } @@ -5729,9 +5724,6 @@ static void perf_output_read_one(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -5776,6 +5768,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) +/* + * XXX PERF_SAMPLE_READ vs inherited events seems difficult. + * + * The problem is that its both hard and excessively expensive to iterate the + * child list, not to mention that its impossible to IPI the children running + * on another CPU, from interrupt/NMI context. + */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { @@ -7316,6 +7315,21 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + * To avoid leaking information to userspace, we must always + * reject kernel samples when exclude_kernel is set. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -7337,6 +7351,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); /* + * For security, drop the skid kernel samples if necessary. + */ + if (!sample_is_allowed(event, regs)) + return ret; + + /* * XXX event_limit might not quite work as expected on inherited * events */ @@ -9172,7 +9192,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { - struct pmu *pmu = NULL; + struct pmu *pmu; int idx; int ret; @@ -9441,9 +9461,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited events + * We currently do not support PERF_SAMPLE_READ on inherited events. + * See perf_output_read(). 
*/ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) @@ -9456,9 +9477,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } pmu = perf_init_event(event); - if (!pmu) - goto err_ns; - else if (IS_ERR(pmu)) { + if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } @@ -9471,8 +9490,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, sizeof(unsigned long), GFP_KERNEL); - if (!event->addr_filters_offs) + if (!event->addr_filters_offs) { + err = -ENOMEM; goto err_per_task; + } /* force hw sync on the address filters */ event->addr_filters_gen = 1; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2831480c63a2..ee97196bb151 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..c63226283aef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w) rcu_read_unlock(); } -struct task_struct *try_get_task_struct(struct task_struct **ptask) -{ - struct task_struct *task; - - rcu_read_lock(); - task = task_rcu_dereference(ptask); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - return task; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected @@ -1004,7 +991,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; + wait_queue_entry_t child_wait; int notask_error; }; @@ -1541,7 +1528,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 08ba696aa561..e53770d2bf95 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -560,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_long(); #endif /* @@ -1337,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); @@ -1577,6 +1577,18 @@ static __latent_entropy struct task_struct *copy_process( if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. 
before we jump + to any of the bad_fork_* labels. This is to avoid freeing + p->set_child_tid which is (ab)used as a kthread's data pointer for + kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1743,11 +1755,6 @@ static __latent_entropy struct task_struct *copy_process( } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1845,11 +1852,13 @@ static __latent_entropy struct task_struct *copy_process( */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(&current->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1907,6 +1916,8 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + spin_unlock(&current->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); bad_fork_free_pid: cgroup_threadgroup_change_end(current); @@ -2176,7 +2187,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..d6cf71d08f21 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -225,7 +225,7 @@ struct futex_pi_state { * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 2f9df37940a0..c51a49c9be70 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_icall_topn); +void __gcov_exit(void) +{ + /* Unused.
*/ +} +EXPORT_SYMBOL(__gcov_exit); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 6a5c239c7669..46a18e72bce6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) +#if (__GNUC__ >= 7) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..c94da688ee9b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -880,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, if (!desc) return; - __irq_do_set_handler(desc, handle, 1, NULL); desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..425170d4439b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1312,8 +1312,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) + if (ret) { + irq_release_resources(desc); goto out_mask; + } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ae1a3ba24df5..154ffb489b93 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -38,6 +38,7 @@ #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> +#include <linux/frame.h> #include <asm/page.h> #include <asm/sections.h> @@ -874,7 +875,7 @@ int kexec_load_disabled; * only when panic_cpu holds the current CPU number; this is the only CPU * which processes crash_kexec routines. */ -void __crash_kexec(struct pt_regs *regs) +void __noclone __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs) mutex_unlock(&kexec_mutex); } } +STACK_FRAME_NON_STANDARD(__crash_kexec); void crash_kexec(struct pt_regs *regs) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7367e0ec6f81..adfe3b4cfe05 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -122,7 +122,7 @@ static void *alloc_insn_page(void) return module_alloc(PAGE_SIZE); } -static void free_insn_page(void *page) +void __weak free_insn_page(void *page) { module_memfree(page); } @@ -595,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); @@ -2183,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb, * The vaddr this probe is installed will soon * be vfreed buy not synced to disk. Hence, * disarming the breakpoint isn't needed. + * + * Note, this will also move any optimized probes + * that are pending to be removed from their + * corresponding lists to the freeing_list and + * will not be touched by the delayed + * kprobe_optimizer work handler. 
*/ kill_kprobe(p); } diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 045022557936..ec4565122e65 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -10,6 +10,7 @@ config LIVEPATCH depends on SYSFS depends on KALLSYMS_ALL depends on HAVE_LIVEPATCH + depends on !TRIM_UNUSED_KSYMS help Say Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f8269036bf0b..52c4e907c14b 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - rcu_read_lock(); + /* + * A variant of synchronize_sched() is used to allow patching functions + * where RCU is not watching, see klp_synchronize_transition(). + */ + preempt_disable_notrace(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); @@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(regs, (unsigned long)func->new_func); unlock: - rcu_read_unlock(); + preempt_enable_notrace(); } /* diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index adc0cc64aa4b..b004a1fb6032 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -49,6 +49,28 @@ static void klp_transition_work_fn(struct work_struct *work) static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + */ +static void klp_sync(struct work_struct *work) +{ +} + +/* + * We allow to patch also functions where RCU is not watching, + * e.g. before user_exit(). We can not rely on the RCU infrastructure + * to do the synchronization. Instead hard force the sched synchronization. + * + * This approach allows to use RCU functions for manipulating func_stack + * safely. + */ +static void klp_synchronize_transition(void) +{ + schedule_on_each_cpu(klp_sync); +} + +/* * The transition to the target patch state is complete. Clean up the data * structures. */ @@ -73,7 +95,7 @@ static void klp_complete_transition(void) * func->transition gets cleared, the handler may choose a * removed function. */ - synchronize_rcu(); + klp_synchronize_transition(); } if (klp_transition_patch->immediate) @@ -92,7 +114,7 @@ static void klp_complete_transition(void) /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) - synchronize_rcu(); + klp_synchronize_transition(); read_lock(&tasklist_lock); for_each_process_thread(g, task) { @@ -136,7 +158,11 @@ void klp_cancel_transition(void) */ void klp_update_patch_state(struct task_struct *task) { - rcu_read_lock(); + /* + * A variant of synchronize_sched() is used to allow patching functions + * where RCU is not watching, see klp_synchronize_transition(). 
+ */ + preempt_disable_notrace(); /* * This test_and_clear_tsk_thread_flag() call also serves as a read @@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task) if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) task->patch_state = READ_ONCE(klp_target_state); - rcu_read_unlock(); + preempt_enable_notrace(); } /* @@ -539,7 +565,7 @@ void klp_reverse_transition(void) clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); /* Let any remaining calls to klp_update_patch_state() complete */ - synchronize_rcu(); + klp_synchronize_transition(); klp_start_transition(); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 0a1b3c748478..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); + pr_warn("\n"); + pr_warn("======================================================\n"); + pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); - printk("-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("------------------------------------------------------\n"); + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); + pr_warn("\nwhich lock already depends on the new lock.\n\n"); + pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + pr_warn("\n"); + pr_warn("=====================================================\n"); + pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); - printk("------------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + pr_warn("-----------------------------------------------------\n"); + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr, curr->softirqs_enabled); print_lock(next); - printk("\nand this task is already holding:\n"); + pr_warn("\nand this task is already holding:\n"); print_lock(prev); - printk("which would create a new lock dependency:\n"); + pr_warn("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(KERN_CONT " ->"); + pr_cont(" ->"); print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); + pr_cont("\n"); - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); 
print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); + pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - printk("\nto a %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); + pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); + pr_warn("..."); print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - printk("\nother info that might help us debug this:\n\n"); + pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); + pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nthe dependencies between the lock to be acquired"); + pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) return 0; print_shortest_lock_dependencies(forwards_entry, next_root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); - printk("---------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("--------------------------------------------\n"); + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { - printk("\n"); - printk("======================\n"); - printk("[chain_key collision ]\n"); + pr_warn("\n"); + pr_warn("============================\n"); + pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); - printk("----------------------\n"); - printk("%s/%d: ", current->comm, task_pid_nr(current)); - printk("Hash chain already cached but the contents don't match!\n"); + pr_warn("----------------------------\n"); + pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); + pr_warn("Hash chain already cached but the contents don't match!\n"); - printk("Held locks:"); + pr_warn("Held locks:"); print_chain_keys_held_locks(curr, hlock_next); - printk("Locks in cached chain:"); + pr_warn("Locks 
in cached chain:"); print_chain_keys_chain(chain); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } #endif @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); + pr_warn("\n"); + pr_warn("================================\n"); + pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("--------------------------------\n"); - printk("inconsistent {%s} -> {%s} usage.\n", + pr_warn("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, trace_softirqs_enabled(curr)); print_lock(this); - printk("{%s} state was registered at:\n", usage_str[prev_bit]); + pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_usage_bug_scenario(this); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + pr_warn("\n"); + pr_warn("========================================================\n"); + pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); - printk("---------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", + pr_warn("--------------------------------------------------------\n"); + pr_warn("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); /* Find a middle lock (if one exists) */ depth = get_lock_depth(other); do { if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); + pr_warn("lockdep:%s bad path found in chain graph\n", __func__); break; } middle = entry; @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); 
- printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) return 0; print_shortest_lock_dependencies(other, root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return 0; - printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); + pr_warn("\n"); + pr_warn("==================================\n"); + pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); - printk("----------------------------------\n"); + pr_warn("----------------------------------\n"); - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); + pr_warn("\nbut this task is not holding:\n"); + pr_warn("%s\n", hlock->nest_lock->name); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); + pr_warn("\n"); + pr_warn("=====================================\n"); + pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); - printk("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", + pr_warn("-------------------------------------\n"); + pr_warn("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no more locks to release!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! 
]\n"); + pr_warn("\n"); + pr_warn("=================================\n"); + pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); - printk("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", + pr_warn("---------------------------------\n"); + pr_warn("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no locks held!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); + pr_warn("\n"); + pr_warn("=========================\n"); + pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); - printk("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("-------------------------\n"); + pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", + pr_warn("\n"); + pr_warn("====================================\n"); + pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) int unlock = 1; if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); + pr_warn("INFO: lockdep is turned off.\n"); return; } - printk("\nShowing all locks held in the system:\n"); + pr_warn("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); + pr_warn("hm, tasklist_lock locked, retrying... 
"); if (count) { count--; - printk(" #%d", 10-count); + pr_cont(" #%d", 10-count); mdelay(200); goto retry; } - printk(" ignoring it.\n"); + pr_cont(" ignoring it.\n"); unlock = 0; } else { if (count != 10) - printk(KERN_CONT " locked it.\n"); + pr_cont(" locked it.\n"); } do_each_thread(g, p) { @@ -4445,8 +4445,8 @@ retry: unlock = 1; } while_each_thread(g, p); - printk("\n"); - printk("=============================================\n\n"); + pr_warn("\n"); + pr_warn("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); + pr_warn("\n"); + pr_warn("================================================\n"); + pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); - printk("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", + pr_warn("------------------------------------------------\n"); + pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } @@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); - pr_err("===============================\n"); - pr_err("[ ERR: suspicious RCU usage. ]\n"); + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); - pr_err("-------------------------------\n"); - pr_err("%s:%d %s!\n", file, line, s); - pr_err("\nother info that might help us debug this:\n\n"); - pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("-----------------------------\n"); + pr_warn("%s:%d %s!\n", file, line, s); + pr_warn("\nother info that might help us debug this:\n\n"); + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() @@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * rcu_read_lock_bh() and so on from extended quiescent states. */ if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); + pr_warn("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 32fe775a2eaf..ac35e648b0e5 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! 
]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: circular locking deadlock detected!\n"); + pr_warn("%s\n", print_tainted()); + pr_warn("--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), current->comm, task_pid_nr(current)); @@ -165,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lock->name = name; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif } diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index b585af9a1b50..5078c6ddf4a5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -11,7 +11,7 @@ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..78069895032a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { + int ret; + might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { + int ret; + might_sleep(); - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + int ret; + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1561,6 
+1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { + mutex_release(&lock->dep_map, 1, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); @@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock) lock->magic = NULL; #endif } - EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** @@ -1632,14 +1653,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); * * Initializing of a locked rt lock is not allowed */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) +void __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name); + if (name && key) + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -1660,7 +1683,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + __rt_mutex_init(lock, NULL, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } @@ -1785,12 +1808,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, int ret; raw_spin_lock_irq(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1822,15 +1847,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); + /* * Unless we're the owner; we're still enqueued on the wait_list. * So check if we became owner, if not, take us off the wait_list. */ if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); - fixup_rt_mutex_waiters(lock); cleanup = true; } - /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 6607802efa8b..5c253caffe91 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -17,7 +17,7 @@ #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_init(m, n, k) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d1f3e9f558b8..74a5a7255b4d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * if reparented. 
*/ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->nr_hashed == init_pids) break; schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3b1e0f3ad07f..fa46606f3356 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1425,7 +1425,7 @@ static unsigned int nr_meta_pages; * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33639e0..57d22571f306 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; - int error; + blk_status_t error; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); - hb->error = 0; + hb->error = BLK_STS_OK; } static void hib_end_io(struct bio *bio) @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); - if (bio->bi_error && !hb->error) - hb->error = bio->bi_error; + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static int hib_wait_io(struct hib_bio_batch *hb) +static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1aecf44ab07..bd53ea579dc8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,7 +269,6 @@ static struct console *exclusive_console; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int console_cmdline_cnt; static int preferred_console = -1; int console_set_on_cmdline; @@ -1176,7 +1175,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } @@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { - if (brl_options) - return 0; - - /* - * Maintain an invariant that will help to find if - * the matching console is preferred, see - * register_console(): - * - * The last non-braille console is always - * the preferred one. 
- */ - if (i != console_cmdline_cnt - 1) - swap(console_cmdline[i], - console_cmdline[console_cmdline_cnt - 1]); - - preferred_console = console_cmdline_cnt - 1; - + if (!brl_options) + preferred_console = i; return 0; } } @@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options, braille_set_options(c, brl_options); c->index = idx; - console_cmdline_cnt++; return 0; } /* @@ -2477,23 +2462,12 @@ void register_console(struct console *newcon) } /* - * See if this console matches one we selected on the command line. - * - * There may be several entries in the console_cmdline array matching - * with the same console, one with newcon->match(), another by - * name/index: - * - * pl011,mmio,0x87e024000000,115200 -- added from SPCR - * ttyAMA0 -- added from command line - * - * Traverse the console_cmdline array in reverse order to be - * sure that if this console is preferred then it will be the first - * matching entry. We use the invariant that is maintained in - * __add_preferred_console(). + * See if this console matches one we selected on + * the command line. */ - for (i = console_cmdline_cnt - 1; i >= 0; i--) { - c = console_cmdline + i; - + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..be90c945063f --- /dev/null +++ b/kernel/rcu/Kconfig @@ -0,0 +1,242 @@ +# +# RCU-related configuration options +# + +menu "RCU Subsystem" + +config TREE_RCU + bool + default y if !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. 
+ +config PREEMPT_RCU + bool + default y if PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool + default y if !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. + +config RCU_EXPERT + bool "Make expert-level adjustments to RCU configuration" + default n + help + This option needs to be enabled if you wish to make + expert-level adjustments to RCU configuration. By default, + no such adjustments can be made, which has the often-beneficial + side-effect of preventing "make oldconfig" from asking you all + sorts of detailed questions about how you would like numerous + obscure RCU options to be set up. + + Say Y if you need to make expert-level adjustments to RCU. + + Say N if you are unsure. + +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + +config TINY_SRCU + bool + default y if SRCU && TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if SRCU && !TINY_RCU + help + This option selects the full-fledged version of SRCU. + +config TASKS_RCU + bool + default n + select SRCU + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || PREEMPT_RCU ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config RCU_NEED_SEGCBLIST + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. 
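For orientation on the TREE_RCU, PREEMPT_RCU and TINY_RCU entries at the top of this new Kconfig file: none of them is user-visible, the flavor simply falls out of CONFIG_PREEMPT and CONFIG_SMP. The standalone sketch below is only an illustration of how those defaults resolve; it is not part of this patch, and the -D flags merely stand in for the kernel configuration.

/*
 * Illustration only: which base RCU flavor the defaults above select.
 *   cc -DCONFIG_PREEMPT -o flavor flavor.c && ./flavor   -> PREEMPT_RCU
 *   cc -DCONFIG_SMP -o flavor flavor.c && ./flavor       -> TREE_RCU
 *   cc -o flavor flavor.c && ./flavor                    -> TINY_RCU
 */
#include <stdio.h>

int main(void)
{
#if defined(CONFIG_PREEMPT)
	puts("PREEMPT_RCU");	/* default y if PREEMPT */
#elif defined(CONFIG_SMP)
	puts("TREE_RCU");	/* default y if !PREEMPT && SMP */
#else
	puts("TINY_RCU");	/* default y if !PREEMPT && !SMP */
#endif
	return 0;
}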
+ + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. + + Take the default if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP && RCU_EXPERT + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. 
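As a rough aid to the RCU_FANOUT and RCU_FANOUT_LEAF entries above: leaf rcu_node structures each cover RCU_FANOUT_LEAF CPUs and every level above them fans out by RCU_FANOUT, so tree depth grows only logarithmically with the CPU count. The standalone sketch below is only an illustration, not code from this patch, and rcu_tree_levels() is a made-up name for the example; it shows how many levels the 64-bit defaults imply for a given CPU count.

/*
 * Illustration only: how many rcu_node levels a given fanout implies.
 * Build with any C compiler, e.g. "cc -o levels levels.c && ./levels".
 */
#include <stdio.h>

static int rcu_tree_levels(long nr_cpus, int fanout, int fanout_leaf)
{
	long capacity = fanout_leaf;	/* CPUs covered by a single leaf level */
	int levels = 1;

	while (capacity < nr_cpus) {
		capacity *= fanout;	/* each additional level multiplies coverage */
		levels++;
	}
	return levels;
}

int main(void)
{
	/* 64-bit defaults from the entries above: FANOUT=64, FANOUT_LEAF=16. */
	printf("256 CPUs  -> %d level(s)\n", rcu_tree_levels(256, 64, 16));
	printf("4096 CPUs -> %d level(s)\n", rcu_tree_levels(4096, 64, 16));
	return 0;
}

The fourth-root requirement in the RCU_FANOUT help text reflects the same geometry: a small, bounded number of levels must still be able to cover NR_CPUS.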
+ +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug @@ -0,0 +1,82 @@ +# +# RCU-related debugging configuration options +# + +menu "RCU Debugging" + +config PROVE_RCU + def_bool PROVE_LOCKING + +config TORTURE_TEST + tristate + default n + +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to be built into + the kernel. + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_STALL_COMMON + range 3 300 + default 21 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on DEBUG_KERNEL + default y if TREE_RCU + select TRACE_CLOCK + help + This option enables additional tracepoints for ftrace-style + event tracing. 
+ + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_EQS_DEBUG + bool "Provide debugging asserts for adding NO_HZ support to an arch" + depends on DEBUG_KERNEL + help + This option provides consistency checks in RCU's handling of + NO_HZ. These checks have proven quite helpful in detecting + bugs in arch-specific NO_HZ code. + + Say N here if you need ultimate kernel/user switch latencies + Say Y if you are unsure + +endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,10 +3,11 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o +obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,83 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) + +/* + * Grace-period counter management. + */ + +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + +/* + * Return the counter portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline unsigned long rcu_seq_ctr(unsigned long s) +{ + return s >> RCU_SEQ_CTR_SHIFT; +} + +/* + * Return the state portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline int rcu_seq_state(unsigned long s) +{ + return s & RCU_SEQ_STATE_MASK; +} + +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + +/* Adjust sequence number for start of update-side operation. */ +static inline void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(rcu_seq_state(*sp) != 1); +} + +/* Adjust sequence number for end of update-side operation. */ +static inline void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WARN_ON_ONCE(!rcu_seq_state(*sp)); + WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); +} + +/* Take a snapshot of the update side's sequence number. */ +static inline unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* Return the current value the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. 
+ */ +static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the @@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + RCU_TRACE(trace_rcu_invoke_callback(rn, head);) head->func(head); rcu_lock_release(&rcu_callback_map); return false; @@ -135,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + void rcu_early_boot_tests(void); void rcu_test_sync_prims(void); @@ -144,4 +233,341 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); +#if defined(SRCU) || !defined(TINY_RCU) + +#include <linux/rcu_node_tree.h> + +extern int rcu_num_lvls; +extern int num_rcu_lvl[]; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. + */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Iterate over all possible CPUs in a leaf RCU node. 
+ */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. 
*/ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + +#ifdef CONFIG_TINY_RCU +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +#else /* #ifdef CONFIG_TINY_RCU */ +void rcu_request_urgent_qs_task(struct task_struct *t); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); +void rcutorture_record_test_transition(void); +void rcutorture_record_progress(unsigned long vernum); +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#ifdef CONFIG_RCU_TRACE +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif +#endif + +#ifdef CONFIG_TINY_SRCU + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_idx; + *gpnum = *completed; +} + +#elif defined(CONFIG_TREE_SRCU) + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#endif + +#ifdef CONFIG_TINY_RCU + +/* + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. + */ +static inline unsigned long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods completed. + */ +static inline unsigned long rcu_batches_completed_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods completed. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. 
+ */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + +static inline void show_rcu_gp_kthreads(void) +{ +} + +#else /* #ifdef CONFIG_TINY_RCU */ +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); +unsigned long srcu_batches_completed(struct srcu_struct *sp); +void show_rcu_gp_kthreads(void); +void rcu_force_quiescent_state(void); +void rcu_bh_force_quiescent_state(void); +void rcu_sched_force_quiescent_state(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#ifdef CONFIG_RCU_NOCB_CPU +bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c new file mode 100644 index 000000000000..2b62a38b080f --- /dev/null +++ b/kernel/rcu/rcu_segcblist.c @@ -0,0 +1,505 @@ +/* + * RCU segmented callback lists, function definitions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> + +#include "rcu_segcblist.h" + +/* Initialize simple callback list. */ +void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. 
+ */ +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Initialize an rcu_segcblist structure. + */ +void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. 
This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. 
+ */ +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. 
*/ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". 
We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..6e36e36478cd --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h @@ -0,0 +1,164 @@ +/* + * RCU segmented callback lists, internal-to-rcu header file + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#include <linux/rcu_segcblist.h> + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist tail pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(!rclp->head); + return rclp->tail; +} + +void rcu_cblist_init(struct rcu_cblist *rclp); +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of non-lazy callbacks in segmented callback list. 
*/ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +void rcu_segcblist_init(struct rcu_segcblist *rsclp); +void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -48,6 +48,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); @@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
%s\n", perf_type, s); } while (0) +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nreaders, 0, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, shutdown, !IS_ENABLED(MODULE), + "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); static int rcu_perf_writer_state; #define RTWS_INIT 0 -#define RTWS_EXP_SYNC 1 -#define RTWS_SYNC 2 -#define RTWS_IDLE 2 -#define RTWS_STOPPING 3 +#define RTWS_ASYNC 1 +#define RTWS_BARRIER 2 +#define RTWS_EXP_SYNC 3 +#define RTWS_SYNC 4 +#define RTWS_IDLE 5 +#define RTWS_STOPPING 6 #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -114,6 +123,8 @@ struct rcu_perf_ops { unsigned long (*started)(void); unsigned long (*completed)(void); unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); const char *name; @@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = { .started = rcu_batches_started, .completed = rcu_batches_completed, .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .name = "rcu" @@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_bh, + .gp_barrier = rcu_barrier_bh, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, .name = "rcu_bh" @@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void) return srcu_batches_completed(srcu_ctlp); } +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + static void srcu_perf_synchronize(void) { synchronize_srcu(srcu_ctlp); @@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = { .started = NULL, .completed = srcu_perf_completed, .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, .sync = srcu_perf_synchronize, .exp_sync = srcu_perf_synchronize_expedited, .name = "srcu" }; +static struct srcu_struct srcud; + +static void srcu_sync_perf_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_perf_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_perf_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_perf_init, + .cleanup = srcu_sync_perf_cleanup, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = 
NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcud" +}; + /* * Definitions for sched perf testing. */ @@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = { .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_sched, + .gp_barrier = rcu_barrier_sched, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, .name = "sched" @@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = { .readunlock = tasks_perf_read_unlock, .started = rcu_no_completed, .completed = rcu_no_completed, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .name = "tasks" @@ -344,6 +404,15 @@ rcu_perf_reader(void *arg) } /* + * Callback function for asynchronous grace periods from rcu_perf_writer(). + */ +static void rcu_perf_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + +/* * RCU perf writer kthread. Repeatedly does a grace period. */ static int @@ -352,6 +421,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct rcu_head *rhp = NULL; struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; @@ -380,9 +450,27 @@ rcu_perf_writer(void *arg) } do { + if (writer_holdoff) + udelay(writer_holdoff); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_exp) { + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + rcu_perf_writer_state = RTWS_ASYNC; + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_perf_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { @@ -429,6 +517,10 @@ rcu_perf_writer(void *arg) i++; rcu_perf_wait_shutdown(); } while (!torture_must_stop()); + if (gp_async) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + } rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); @@ -452,6 +544,17 @@ rcu_perf_cleanup(void) u64 *wdp; u64 *wdpp; + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. 
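For reference, the gp_async path above caps the number of callbacks a writer may have in flight and falls back to a grace-period barrier when the cap is reached. A minimal sketch of that pattern using plain RCU primitives (call_rcu() and rcu_barrier()); my_cb_inflight, my_flood_cb() and my_flood_one() are illustrative names, not part of this patch, and the writer kthread above additionally retries after the barrier:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static atomic_t my_cb_inflight;

static void my_flood_cb(struct rcu_head *rhp)
{
	atomic_dec(&my_cb_inflight);
	kfree(rhp);
}

static void my_flood_one(int max_inflight)
{
	struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);

	if (!rhp)
		return;
	if (atomic_read(&my_cb_inflight) < max_inflight) {
		atomic_inc(&my_cb_inflight);
		call_rcu(rhp, my_flood_cb);	/* asynchronous grace period */
	} else {
		kfree(rhp);
		rcu_barrier();			/* drain callbacks already queued */
	}
}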
+ */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + if (torture_cleanup_begin()) return; @@ -554,7 +657,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, RCUPERF_TASKS_OPS }; @@ -624,16 +727,6 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } - if (rcu_gp_is_normal() && gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); @@ -559,19 +561,22 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int cpu; - int idx = srcu_ctlp->completed & 0x1; + int __maybe_unused cpu; + int idx; - pr_alert("%s%s per-CPU(idx=%d):", +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); + struct srcu_data *counts; - u0 = counts->unlock_count[!idx]; - u1 = counts->unlock_count[idx]; + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; /* * Make sure that a lock is always counted if the corresponding @@ -579,14 +584,21 @@ static void srcu_torture_stats(void) */ smp_rmb(); - l0 = counts->lock_count[!idx]; - l1 = counts->lock_count[idx]; + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; c0 = l0 - u0; c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#elif defined(CONFIG_TINY_SRCU) + idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + torture_type, TORTURE_FLAG, idx, + READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), + READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); +#endif } static void srcu_torture_synchronize_expedited(void) @@ -1333,12 +1345,14 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { - int __maybe_unused flags; - unsigned long __maybe_unused gpnum; - unsigned long __maybe_unused completed; + int __maybe_unused flags = 0; + unsigned long __maybe_unused gpnum = 0; + unsigned long __maybe_unused completed = 0; rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); wtp = 
READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index ef3bcfb15b39..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney <paulmck@us.ibm.com> - * Lai Jiangshan <laijs@cn.fujitsu.com> - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include <linux/export.h> -#include <linux/mutex.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/rcupdate_wait.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/delay.h> -#include <linux/srcu.h> - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. 
*/ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[idx]); - } - return sum; -} - -/* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->unlock_count[idx]); - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be zero. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long unlocks; - - unlocks = srcu_readers_unlock_idx(sp, idx); - - /* - * Make sure that a lock is always counted if the corresponding unlock - * is counted. Needs to be a smp_mb() as the read side may contain a - * read from a variable that is written to before the synchronize_srcu() - * in the write side. In this case smp_mb()s A and B act like the store - * buffering pattern. - * - * This smp_mb() also pairs with smp_mb() C to prevent accesses after the - * synchronize_srcu() from being executed before the grace period ends. - */ - smp_mb(); /* A */ - - /* - * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does not - * mean that there are no more readers, as one could have read the - * current index but not have incremented the lock counter yet. - * - * Possible bug: There is no guarantee that there haven't been ULONG_MAX - * increments of ->lock_count[] since the unlocks were counted, meaning - * that this could return true even if there are still active readers. - * Since there are no memory barriers around srcu_flip(), the CPU is not - * required to increment ->completed before running - * srcu_readers_unlock_idx(), which means that there could be an - * arbitrarily large number of critical sections that execute after - * srcu_readers_unlock_idx() but use the old value of ->completed. - */ - return srcu_readers_lock_idx(sp, idx) == unlocks; -} - -/** - * srcu_readers_active - returns true if there are readers. and false - * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. 
- */ -static bool srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. - */ -static void srcu_flip(struct srcu_struct *sp) -{ - WRITE_ONCE(sp->completed, sp->completed + 1); - - /* - * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). - */ - smp_mb(); /* D */ /* Pairs with C. 
*/ -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing SRCU read-side critical section. On systems with - * more than one CPU, this means that when "func()" is invoked, each CPU - * is guaranteed to have executed a full memory barrier since the end of - * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing - * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() - * but before the beginning of that SRCU read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting SRCU callback function "func()", then both CPU A and CPU - * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". - * This guarantee applies even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * Of course, these guarantees apply only for invocations of call_srcu(), - * srcu_read_lock(), and srcu_read_unlock() that are all passed the same - * srcu_struct structure. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - rcu_callback_t func) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || - lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) { - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. 
*/ - } - -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section, - * as long as the resulting graph of srcu_structs is acyclic. - * - * There are memory-ordering constraints implied by synchronize_srcu(). - * On systems with more than one CPU, when synchronize_srcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section - * whose beginning preceded the call to synchronize_srcu(). In addition, - * each CPU having an SRCU read-side critical section that extends beyond - * the return from synchronize_srcu() is guaranteed to have executed a - * full memory barrier after the beginning of synchronize_srcu() and before - * the beginning of that SRCU read-side critical section. Note that these - * guarantees include CPUs that are offline, idle, or executing in user mode, - * as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_srcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_srcu(). This guarantee applies even if CPU A and CPU B - * are the same CPU, but again only if the system has more than one CPU. - * - * Of course, these memory-ordering guarantees apply only when - * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are - * passed the same srcu_struct structure. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. 
- */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. Note that needed memory barriers have been executed - * in this task's context by srcu_readers_active_idx_check(). - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. 
- */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - queue_delayed_work(system_power_efficient_wq, - &sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..1a1c1047d2ed --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,195 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/srcu.h> + +#include <linux/rcu_node_tree.h> +#include "rcu_segcblist.h" +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. 
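To make the comment above concrete, a minimal sketch of the updater-side lifecycle of an SRCU domain, using the srcu_struct API initialized here plus the usual RCU pointer helpers; my_srcu, my_data and the helper functions are illustrative names, not part of this patch:

#include <linux/slab.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;
static int __rcu *my_data;

static int my_setup(void)
{
	return init_srcu_struct(&my_srcu);	/* before any other use */
}

static void my_publish(int *newp)
{
	int *oldp = rcu_dereference_protected(my_data, 1);	/* single updater assumed */

	rcu_assign_pointer(my_data, newp);
	synchronize_srcu(&my_srcu);	/* wait out pre-existing readers */
	kfree(oldp);
}

static void my_teardown(void)
{
	cleanup_srcu_struct(&my_srcu);	/* no readers or callbacks may remain */
}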
+ */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(sp->srcu_cb_head); + WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_head *lh; + struct rcu_head *rhp; + struct srcu_struct *sp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) + return; /* Already running or nothing to do. */ + + /* Remove recently arrived callbacks and wait for readers. */ + WRITE_ONCE(sp->srcu_gp_running, true); + local_irq_disable(); + lh = sp->srcu_cb_head; + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + local_irq_enable(); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + + /* Invoke the callbacks we removed above. */ + while (lh) { + rhp = lh; + lh = lh->next; + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + + /* + * Enable rescheduling, and if there are more callbacks, + * reschedule ourselves. This can race with a call_srcu() + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. + */ + WRITE_ONCE(sp->srcu_gp_running, false); + if (READ_ONCE(sp->srcu_cb_head)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. 
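A companion sketch for the asynchronous interface defined below: queue a callback with call_srcu(), then flush outstanding callbacks with srcu_barrier() before tearing the domain down (this is what rcuperf's SRCU gp_barrier hook does). struct my_obj, my_retire() and my_exit() are illustrative names, not part of this patch:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_obj {
	int value;
	struct rcu_head rh;
};

static struct srcu_struct my_srcu;

static void my_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rh));
}

static void my_retire(struct my_obj *p)
{
	call_srcu(&my_srcu, &p->rh, my_free_cb);	/* freed after a grace period */
}

static void my_exit(void)
{
	srcu_barrier(&my_srcu);		/* wait for callbacks still in flight */
	cleanup_srcu_struct(&my_srcu);
}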
+ */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + unsigned long flags; + + rhp->func = func; + rhp->next = NULL; + local_irq_save(flags); + *sp->srcu_cb_tail = rhp; + sp->srcu_cb_tail = &rhp->next; + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..d0ca524bf042 --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,1227 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + * Lai Jiangshan <laijs@cn.fujitsu.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/srcu.h> + +#include "rcu.h" +#include "rcu_segcblist.h" + +/* Holdoff in nanoseconds for auto-expediting. */ +#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) +static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; +module_param(exp_holdoff, ulong, 0444); + +/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ +static ulong counter_wrap_check = (ULONG_MAX >> 2); +module_param(counter_wrap_check, ulong, 0444); + +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +{ + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. 
*/ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. */ + rcu_for_each_node_breadth_first(sp, snp) { + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + ARRAY_SIZE(snp->srcu_data_have_cbs)); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { + snp->srcu_have_cbs[i] = 0; + snp->srcu_data_have_cbs[i] = 0; + } + snp->srcu_gp_seq_needed_exp = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. */ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; + sp->srcu_gp_seq = 0; + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&sp->work, process_srcu); + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + sp->srcu_gp_seq_needed_exp = 0; + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. 
+ * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use sp->lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. + */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). */ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + raw_spin_lock_irqsave_rcu_node(sp, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + raw_spin_unlock_irqrestore_rcu_node(sp, flags); + return; + } + init_srcu_struct_fields(sp, true); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. + * + * So suppose that the updater is preempted here for so long + * that more than ULONG_MAX non-nested readers come and go in + * the meantime. 
It turns out that this cannot result in overflow + * because if a reader modifies its unlock count after we read it + * above, then that reader's next load of ->srcu_idx is guaranteed + * to get the new value, which will cause it to operate on the + * other bank of counters, where it cannot contribute to the + * overflow of these counters. This means that there is a maximum + * of 2*NR_CPUS increments, which cannot overflow given current + * systems, especially not on 64-bit systems. + * + * OK, how about nesting? This does impose a limit on nesting + * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, + * especially on 64-bit systems. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers. and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/* + * Return grace-period delay, zero if there are expedited grace + * periods pending, SRCU_INTERVAL otherwise. + */ +static unsigned long srcu_get_delay(struct srcu_struct *sp) +{ + if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), + READ_ONCE(sp->srcu_gp_seq_needed_exp))) + return 0; + return SRCU_INTERVAL; +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int cpu; + + if (WARN_ON(!srcu_get_delay(sp))) + return; /* Leakage unless caller handles error. */ + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->sda); + sp->sda = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + this_cpu_inc(sp->sda->srcu_lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). 
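For completeness, the reader side that drives these per-CPU counters, using the public wrappers that end up in __srcu_read_lock() and __srcu_read_unlock(); my_srcu, my_data and my_read() are illustrative names, not part of this patch:

#include <linux/srcu.h>

static struct srcu_struct my_srcu;
static int __rcu *my_data;

static int my_read(void)
{
	int idx, val = 0;
	int *p;

	idx = srcu_read_lock(&my_srcu);		/* SRCU readers may block */
	p = srcu_dereference(my_data, &my_srcu);
	if (p)
		val = *p;
	srcu_read_unlock(&my_srcu, idx);	/* must pass the matching idx */
	return val;
}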
+ */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + struct srcu_data *sdp = this_cpu_ptr(sp->sda); + int state; + + lockdep_assert_held(&sp->lock); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Track online CPUs to guide callback workqueue placement. + */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it whereever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure that have callbacks for the + * just-completed grace period, the one corresponding to idx. If possible, + * schedule this invocation on the corresponding CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long mask, unsigned long delay) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + if (!(mask & (1 << (cpu - snp->grplo)))) + continue; + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + } +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. 
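The bookkeeping above leans on the rcu_seq_*() helpers from RCU's internal headers: a requester snapshots the sequence value that will signify a sufficient grace period and later asks whether it has been reached, mirroring how srcu_gp_start() and the funnel functions use them in this file. A hedged sketch of that protocol; my_gp_snap() and my_gp_elapsed() are illustrative names that would live alongside this code:

/* Sequence value that, once reached, guarantees a full grace period. */
static unsigned long my_gp_snap(struct srcu_struct *sp)
{
	return rcu_seq_snap(&sp->srcu_gp_seq);
}

/* Has the grace period identified by the earlier snapshot completed? */
static bool my_gp_elapsed(struct srcu_struct *sp, unsigned long s)
{
	return rcu_seq_done(&sp->srcu_gp_seq, s);
}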
+ */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + unsigned long cbdelay; + bool cbs; + int cpu; + unsigned long flags; + unsigned long gpseq; + int idx; + int idxnext; + unsigned long mask; + struct srcu_data *sdp; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + raw_spin_lock_irq_rcu_node(sp); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + cbdelay = srcu_get_delay(sp); + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) + sp->srcu_gp_seq_needed_exp = gpseq; + raw_spin_unlock_irq_rcu_node(sp); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. */ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + raw_spin_lock_irq_rcu_node(snp); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + snp->srcu_gp_seq_needed_exp = gpseq; + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + raw_spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + sdp = per_cpu_ptr(sp->sda, cpu); + raw_spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + raw_spin_lock_irq_rcu_node(sp); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + raw_spin_unlock_irq_rcu_node(sp); + /* Throttle expedited grace periods: Should be rare! */ + srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff + ? 0 : SRCU_INTERVAL); + } else { + raw_spin_unlock_irq_rcu_node(sp); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent expedited + * grace-period requests. This function is invoked for the first known + * expedited request for a grace period that has already been requested, + * but without expediting. To start a completely new grace period, + * whether expedited or not, use srcu_funnel_gp_start() instead. 
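Note that all of the sequence-number comparisons in this funnel code use ULONG_CMP_LT()/ULONG_CMP_GE() rather than plain < and >=, because the counters are allowed to wrap. A small illustrative helper showing why; my_seq_before() and the example values are hypothetical:

static bool my_seq_before(unsigned long a, unsigned long b)
{
	/*
	 * Wrap-safe "a comes before b": the RCU headers define
	 * ULONG_CMP_LT(a, b) as (ULONG_MAX / 2 < (a) - (b)), so a
	 * counts as before b when it trails by less than half the
	 * counter space.  Example: with a == ULONG_MAX - 3 and
	 * b == 4 after a wrap, this returns true even though a > b
	 * numerically, which a plain "a < b" would get wrong.
	 */
	return ULONG_CMP_LT(a, b);
}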
+ */ +static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long s) +{ + unsigned long flags; + + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + raw_spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + raw_spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); + } + raw_spin_lock_irqsave_rcu_node(sp, flags); + if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + raw_spin_unlock_irqrestore_rcu_node(sp, flags); +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. + */ +static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, + unsigned long s, bool do_norm) +{ + unsigned long flags; + int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + struct srcu_node *snp = sdp->mynode; + unsigned long snp_seq; + + /* Each pass through the loop does one level of the srcu_node tree. */ + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + return; /* GP already done and CBs recorded. */ + raw_spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + if (snp == sdp->mynode && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == sdp->mynode && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(sp, snp, s); + return; + } + snp->srcu_have_cbs[idx] = s; + if (snp == sdp->mynode) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + snp->srcu_gp_seq_needed_exp = s; + raw_spin_unlock_irqrestore_rcu_node(snp, flags); + } + + /* Top of tree, must ensure the grace period will be started. */ + raw_spin_lock_irqsave_rcu_node(sp, flags); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + /* + * Record need for grace period s. Pair with load + * acquire setting up for initialization. + */ + smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + } + if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + + /* If grace period not already done and none in progress, start it. */ + if (!rcu_seq_done(&sp->srcu_gp_seq, s) && + rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, + srcu_get_delay(sp)); + } + raw_spin_unlock_irqrestore_rcu_node(sp, flags); +} + +/* + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->srcu_idx is not changed while checking. 
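The leaf-to-root walks in srcu_funnel_exp_start() and srcu_funnel_gp_start() above share one idea: record the requested grace-period number at each level and stop climbing as soon as some node already covers the request, so that at most one requester per grace period reaches the root. Below is a minimal single-threaded model of that walk; locking, sequence wrap, and the real srcu_node layout are deliberately omitted, and all names are invented.

#include <stdbool.h>
#include <stdio.h>

struct funnel_node {
	unsigned long gp_requested;	/* Highest GP number recorded here. */
	struct funnel_node *parent;	/* NULL at the root. */
};

/* Returns true if this caller reached the root and must start GP "s". */
static bool funnel_request(struct funnel_node *leaf, unsigned long s)
{
	struct funnel_node *np;

	for (np = leaf; np; np = np->parent) {
		if (np->gp_requested >= s)
			return false;	/* Someone beat us to it; stop here. */
		np->gp_requested = s;	/* Record and keep climbing. */
	}
	return true;			/* We are the winner. */
}

int main(void)
{
	struct funnel_node root = { 0, NULL };
	struct funnel_node leaf0 = { 0, &root }, leaf1 = { 0, &root };

	printf("requester on leaf0 starts GP 4: %d\n", funnel_request(&leaf0, 4));
	printf("requester on leaf1 starts GP 4: %d\n", funnel_request(&leaf1, 4));
	return 0;
}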
+ */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount + !srcu_get_delay(sp) <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->srcu_idx counter so that future SRCU readers will + * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + /* + * Ensure that if this updater saw a given reader's increment + * from __srcu_read_lock(), that reader was using an old value + * of ->srcu_idx. Also ensure that if a given reader sees the + * new value of ->srcu_idx, this updater's earlier scans cannot + * have seen that reader's increments (which is OK, because this + * grace period need not wait on that reader). + */ + smp_mb(); /* E */ /* Pairs with B and C. */ + + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ +} + +/* + * If SRCU is likely idle, return true, otherwise return false. + * + * Note that it is OK for several current from-idle requests for a new + * grace period from idle to specify expediting because they will all end + * up requesting the same grace period anyhow. So no loss. + * + * Note also that if any CPU (including the current one) is still invoking + * callbacks, this function will nevertheless say "idle". This is not + * ideal, but the overhead of checking all CPUs' callback lists is even + * less ideal, especially on large systems. Furthermore, the wakeup + * can happen before the callback is fully removed, so we have no choice + * but to accept this type of error. + * + * This function is also subject to counter-wrap errors, but let's face + * it, if this function was preempted for enough time for the counters + * to wrap, it really doesn't matter whether or not we expedite the grace + * period. The extra overhead of a needlessly expedited grace period is + * negligible when amoritized over that time period, and the extra latency + * of a needlessly non-expedited grace period is similarly negligible. + */ +static bool srcu_might_be_idle(struct srcu_struct *sp) +{ + unsigned long curseq; + unsigned long flags; + struct srcu_data *sdp; + unsigned long t; + + /* If the local srcu_data structure has callbacks, not idle. */ + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { + local_irq_restore(flags); + return false; /* Callbacks already present, so not idle. */ + } + local_irq_restore(flags); + + /* + * No local callbacks, so probabalistically probe global state. + * Exact information would require acquiring locks, which would + * kill scalability, hence the probabalistic nature of the probe. + */ + + /* First, see if enough time has passed since the last GP. */ + t = ktime_get_mono_fast_ns(); + if (exp_holdoff == 0 || + time_in_range_open(t, sp->srcu_last_gp_end, + sp->srcu_last_gp_end + exp_holdoff)) + return false; /* Too soon after last GP. */ + + /* Next, check for probable idleness. */ + curseq = rcu_seq_current(&sp->srcu_gp_seq); + smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. 
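try_check_zero() and srcu_flip() above implement the classic SRCU counting scheme: readers bump a lock counter on entry and an unlock counter on exit, indexed by ->srcu_idx, and the updater flips the index and waits for the retired slot's two totals to match. The sketch below is a single-threaded model of just that bookkeeping; the per-CPU distribution and the barriers labelled B/C/D/E are omitted, so it illustrates the idea rather than the real synchronization.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong lock_count[2];	/* Entries into each slot.  */
static atomic_ulong unlock_count[2];	/* Exits from each slot.    */
static atomic_int cur_idx;		/* Low bit selects the slot. */

static int reader_enter(void)
{
	int idx = atomic_load(&cur_idx) & 1;

	atomic_fetch_add(&lock_count[idx], 1);
	return idx;
}

static void reader_exit(int idx)
{
	atomic_fetch_add(&unlock_count[idx], 1);
}

/* Updater side: after flipping the index, wait for the old slot to drain. */
static bool old_slot_drained(int old_idx)
{
	return atomic_load(&unlock_count[old_idx]) ==
	       atomic_load(&lock_count[old_idx]);
}

int main(void)
{
	int idx = reader_enter();		/* A pre-existing reader.   */

	atomic_fetch_add(&cur_idx, 1);		/* srcu_flip() analogue.    */
	printf("drained? %d\n", old_slot_drained(idx));	/* 0: still active. */
	reader_exit(idx);
	printf("drained? %d\n", old_slot_drained(idx));	/* 1: GP may end.   */
	return 0;
}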
*/ + if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + return false; /* Grace period in progress, so not idle. */ + smp_mb(); /* Order ->srcu_gp_seq with prior access. */ + if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + return false; /* GP # changed, so not idle. */ + return true; /* With reasonable probability, idle! */ +} + +/* + * SRCU callback function to leak a callback. + */ +static void srcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating + * grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. + */ +void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) +{ + unsigned long flags; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(sp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ + WRITE_ONCE(rhp->func, srcu_leak_callback); + WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); + return; + } + rhp->func = func; + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + raw_spin_lock_rcu_node(sdp); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + s = rcu_seq_snap(&sp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + if (needgp) + srcu_funnel_gp_start(sp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(sp, sdp->mynode, s); +} + +/** + * call_srcu() - Queue a callback for invocation after an SRCU grace period + * @sp: srcu_struct in queue the callback + * @head: structure to be used for queueing the SRCU callback. 
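A hedged usage sketch of the asynchronous interface implemented by __call_srcu() above: queue a callback that frees an object once every pre-existing SRCU reader has finished. The srcu_struct name and the structure being freed are invented for illustration.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rh;
	int val;
};

DEFINE_SRCU(my_srcu);

static void free_foo_cb(struct rcu_head *rhp)
{
	/* Invoked from process context after a full SRCU grace period. */
	kfree(container_of(rhp, struct foo, rh));
}

static void retire_foo(struct foo *p)
{
	/* Returns immediately; the grace period is waited for asynchronously. */
	call_srcu(&my_srcu, &p->rh, free_foo_cb);
}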
+ * @func: function to be invoked after the SRCU grace period + * + * The callback function will be invoked some time after a full SRCU + * grace period elapses, in other words after all pre-existing SRCU + * read-side critical sections have completed. However, the callback + * function might well execute concurrently with other SRCU read-side + * critical sections that started after call_srcu() was invoked. SRCU + * read-side critical sections are delimited by srcu_read_lock() and + * srcu_read_unlock(), and may be nested. + * + * The callback will be invoked from process context, but must nevertheless + * be fast and must not block. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + __call_srcu(sp, rhp, func, true); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) +{ + struct rcu_synchronize rcu; + + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + might_sleep(); + check_init_srcu_struct(sp); + init_completion(&rcu.completion); + init_rcu_head_on_stack(&rcu.head); + __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, rcu_gp_is_normal()); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, + * and then flip the srcu_idx and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). 
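The guarantee documented here is what makes the usual publish-then-retire pattern safe. A hedged sketch of that pattern follows; "my_srcu", "struct foo", and the global pointer are invented names, and the updater is assumed to be serialized by its caller (for example under a mutex).

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	int val;
};

DEFINE_SRCU(my_srcu);
static struct foo __rcu *global_foo;

static int read_foo(void)
{
	struct foo *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(global_foo, &my_srcu);
	if (p)
		val = p->val;		/* Valid until srcu_read_unlock(). */
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void update_foo(struct foo *newp)
{
	struct foo *oldp;

	/* Caller is assumed to serialize updaters. */
	oldp = rcu_dereference_protected(global_foo, 1);
	rcu_assign_pointer(global_foo, newp);
	synchronize_srcu(&my_srcu);	/* Wait for pre-existing readers. */
	kfree(oldp);			/* No reader can still see oldp.  */
}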
In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + * + * If SRCU is likely idle, expedite the first request. This semantic + * was provided by Classic SRCU, and is relied upon by its users, so TREE + * SRCU must also provide it. Note that detecting idleness is heuristic + * and subject to both false positives and negatives. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp, true); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. */ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. */ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. Note that if + * a CPU already has callbacks enqueue, it must have already + * registered the need for a future grace period, so all we + * need do is enqueue a callback that will use the same + * grace period as the last callback already in the queue. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + raw_spin_lock_irq_rcu_node(sdp); + atomic_inc(&sp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head, 0)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&sp->srcu_barrier_cpu_cnt); + } + raw_spin_unlock_irq_rcu_node(sdp); + } + + /* Remove the initial count, at which point reaching zero can happen. 
*/ + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); + wait_for_completion(&sp->srcu_barrier_completion); + + rcu_seq_end(&sp->srcu_barrier_seq); + mutex_unlock(&sp->srcu_barrier_mutex); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->srcu_idx; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +/* + * Core SRCU state machine. Push state bits of ->srcu_gp_seq + * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has + * completed in that state. + */ +static void srcu_advance_state(struct srcu_struct *sp) +{ + int idx; + + mutex_lock(&sp->srcu_gp_mutex); + + /* + * Because readers might be delayed for an extended period after + * fetching ->srcu_idx for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. + */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + raw_spin_lock_irq_rcu_node(sp); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); + raw_spin_unlock_irq_rcu_node(sp); + mutex_unlock(&sp->srcu_gp_mutex); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + raw_spin_unlock_irq_rcu_node(sp); + if (idx != SRCU_STATE_IDLE) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* Someone else started the grace period. */ + } + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 1)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_flip(sp); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. + */ + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 2)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + } +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct work_struct *work) +{ + bool more; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; + rcu_cblist_init(&ready_cbs); + raw_spin_lock_irq_rcu_node(sdp); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + raw_spin_unlock_irq_rcu_node(sdp); + return; /* Someone else on the job or nothing to do. 
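srcu_barrier() above exists mainly for teardown: before an srcu_struct (or the module owning its callbacks) can go away, every callback already queued with call_srcu() must have been invoked. Below is a hedged sketch of that ordering, with invented names and a dynamically initialized srcu_struct assumed.

#include <linux/srcu.h>

static struct srcu_struct my_srcu;

static int my_init(void)
{
	return init_srcu_struct(&my_srcu);
}

static void my_exit(void)
{
	/* Callers must already be unable to queue new call_srcu() callbacks. */
	srcu_barrier(&my_srcu);		/* Wait for all queued callbacks to run. */
	cleanup_srcu_struct(&my_srcu);	/* Only now release SRCU's own state.    */
}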
*/ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + raw_spin_unlock_irq_rcu_node(sdp); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + debug_rcu_head_unqueue(rhp); + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + raw_spin_lock_irq_rcu_node(sdp); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + raw_spin_unlock_irq_rcu_node(sdp); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pushgp = true; + + raw_spin_lock_irq_rcu_node(sp); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. */ + srcu_gp_start(sp); + } + raw_spin_unlock_irq_rcu_node(sp); + + if (pushgp) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_state(sp); + srcu_reschedule(sp, srcu_get_delay(sp)); +} +EXPORT_SYMBOL_GPL(process_srcu); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = rcu_seq_ctr(sp->srcu_gp_seq); + *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); +} +EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Hierarchical SRCU implementation.\n"); + if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) + pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + return 0; +} +early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,15 +35,26 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/trace_events.h> #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ -struct rcu_ctrlblk; -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp); +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. 
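The rcu_ctrlblk just above is a single singly-linked callback list carrying two tail pointers: new callbacks are appended at *curtail, and a quiescent state advances donetail to curtail, marking everything before it ready to invoke. The userspace model below walks through exactly that pointer dance; it is single-threaded and the names are invented.

#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	const char *name;
};

static struct cb *cblist;
static struct cb **donetail = &cblist;	/* ->next of last "done" CB.    */
static struct cb **curtail = &cblist;	/* ->next of last CB overall.   */

static void enqueue(struct cb *p)
{
	p->next = NULL;
	*curtail = p;
	curtail = &p->next;
}

static void note_quiescent_state(void)
{
	donetail = curtail;	/* Everything queued so far is now "done". */
}

/* Detach and return the "done" segment, leaving the rest queued. */
static struct cb *invoke_done(void)
{
	struct cb *list = cblist;

	cblist = *donetail;		/* Remainder becomes the new list.   */
	*donetail = NULL;		/* Terminate the detached segment.   */
	if (curtail == donetail)	/* List is now empty...              */
		curtail = &cblist;	/* ...so reset the overall tail too. */
	donetail = &cblist;
	return list;
}

int main(void)
{
	struct cb a = { .name = "a" }, b = { .name = "b" };
	struct cb *p;

	enqueue(&a);
	enqueue(&b);
	note_quiescent_state();
	for (p = invoke_done(); p; p = p->next)
		printf("invoking callback %s\n", p->name);
	return 0;
}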
*/ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; #include "tiny_plugin.h" @@ -59,19 +70,6 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL(rcu_barrier_sched); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool notrace __rcu_is_watching(void) -{ - return true; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers @@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp)); if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -125,7 +122,6 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls()); if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -140,10 +136,8 @@ void rcu_check_callbacks(int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0); /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(rn, list); + __rcu_reclaim("", list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++); } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) @@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); - rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -22,37 +22,7 @@ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -#include <linux/kthread.h> -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. 
*/ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include <linux/kernel_stat.h> int rcu_scheduler_active __read_mostly; @@ -65,105 +35,13 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; -} - -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -/* - * Dump statistics for TINY_RCU, such as they are. - */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutiny_trace_init); - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = READ_ONCE(rcp->jiffies_stall); - if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - } else if (ULONG_CMP_GE(j, js)) { - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); - } -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 
rcu_jiffies_till_stall_check()); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; } -#endif /* #ifdef CONFIG_RCU_TRACE */ +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6dcf3bd244f..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -98,8 +98,8 @@ struct rcu_state sname##_state = { \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_nxttail = &sname##_state.orphan_nxtlist, \ - .orphan_donetail = &sname##_state.orphan_donelist, \ + .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ + .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -124,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ -static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; +int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; @@ -168,39 +168,21 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ -#ifdef CONFIG_RCU_KTHREAD_PRIO -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; -module_param(gp_preinit_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ -static const int gp_preinit_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT -static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; -module_param(gp_init_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -static const int gp_init_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; -module_param(gp_cleanup_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ -static const int gp_cleanup_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static int gp_preinit_delay; +module_param(gp_preinit_delay, int, 0444); +static int gp_init_delay; +module_param(gp_init_delay, int, 0444); +static int gp_cleanup_delay; +module_param(gp_cleanup_delay, int, 0444); /* * Number of grace periods between delays, normalized by the duration of - * the delay. The longer the the delay, the more the grace periods between + * the delay. The longer the delay, the more the grace periods between * each delay. 
The reason for this normalization is that it means that, * for non-zero delays, the overall slowdown of grace periods is constant * regardless of the duration of the delay. This arrangement balances @@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -265,6 +248,7 @@ void rcu_sched_qs(void) void rcu_bh_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), @@ -273,15 +257,19 @@ void rcu_bh_qs(void) } } -static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; /* @@ -305,15 +293,20 @@ bool rcu_irq_enter_disabled(void) static void rcu_dynticks_eqs_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior RCU read-side + * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); } /* @@ -323,15 +316,22 @@ static void rcu_dynticks_eqs_enter(void) static void rcu_dynticks_eqs_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. 
*/ + rcu_eqs_special_exit(); + } } /* @@ -348,9 +348,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (atomic_read(&rdtp->dynticks) & 0x1) + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(0x1, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); } /* @@ -362,7 +362,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - return !(atomic_read(&rdtp->dynticks) & 0x1); + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -373,7 +373,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); - return snap; + return snap & ~RCU_DYNTICK_CTRL_MASK; } /* @@ -382,7 +382,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICK_CTRL_CTR); } /* @@ -402,14 +402,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) static void rcu_dynticks_momentary_idle(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2, &rdtp->dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. + */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; +} /* * Let the RCU core know that this CPU has gone through the scheduler, @@ -418,44 +438,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. 
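The RCU_DYNTICK_CTRL_MASK/RCU_DYNTICK_CTRL_CTR changes above steal bit 0 of ->dynticks as a "do something special on the next idle exit" flag, with the counter itself advancing in steps of two so that bit 1 still says whether the CPU is in an extended quiescent state. The userspace model below mirrors that encoding with C11 atomics; it is an illustration, not the kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1			/* Deferred-action flag.      */
#define CTRL_CTR  (CTRL_MASK + 1)	/* Counter advances by this.  */

static atomic_int dynticks = CTRL_CTR;	/* Start out "not idle".      */

static void eqs_enter(void)
{
	atomic_fetch_add(&dynticks, CTRL_CTR);	/* Bit 1 becomes clear.     */
}

/* Succeeds only while the target is in an extended quiescent state. */
static bool eqs_special_set(void)
{
	int old, newval;

	do {
		old = atomic_load(&dynticks);
		if (old & CTRL_CTR)
			return false;	/* Not idle: caller must act otherwise. */
		newval = old | CTRL_MASK;
	} while (!atomic_compare_exchange_weak(&dynticks, &old, newval));
	return true;
}

static void eqs_exit(void)
{
	int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

	if (seq & CTRL_MASK) {
		atomic_fetch_and(&dynticks, ~CTRL_MASK);
		printf("deferred action (e.g. TLB flush) runs here\n");
	}
}

int main(void)
{
	printf("set while busy: %d\n", eqs_special_set());	/* 0 */
	eqs_enter();
	printf("set while idle: %d\n", eqs_special_set());	/* 1 */
	eqs_exit();						/* flush */
	return 0;
}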
- * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rcu_dynticks_momentary_idle(); - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -463,14 +453,22 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_preempt_note_context_switch(preempt); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); + if (!preempt) + rcu_note_voluntary_context_switch_lite(current); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -493,35 +491,35 @@ void rcu_all_qs(void) { unsigned long flags; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. - */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } - this_cpu_inc(rcu_qs_ctr); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +static long blimit = DEFAULT_RCU_BLIMIT; +#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +static long qhimark = DEFAULT_RCU_QHIMARK; +#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. 
*/ +static long qlowmark = DEFAULT_RCU_QLOMARK; module_param(blimit, long, 0444); module_param(qhimark, long, 0444); @@ -544,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -704,15 +699,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, default: break; } - if (rsp != NULL) { - *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + if (rsp == NULL) return; - } - *flags = 0; - *gpnum = 0; - *completed = 0; + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); @@ -728,16 +719,6 @@ void rcutorture_record_progress(unsigned long vernum) EXPORT_SYMBOL_GPL(rcutorture_record_progress); /* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_NEXT_TAIL] != NULL; -} - -/* * Return the root node of the specified rcu_state structure. */ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) @@ -756,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); return READ_ONCE(*fp); } @@ -767,21 +749,18 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - int i; - + RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) return true; /* Yes, a no-CBs CPU needs one. */ - if (!rdp->nxttail[RCU_NEXT_TAIL]) + if (!rcu_segcblist_is_enabled(&rdp->cblist)) return false; /* No, this is a no-CBs (or offline) CPU. */ - if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return true; /* Yes, CPU has newly registered callbacks. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(READ_ONCE(rsp->completed), - rdp->nxtcompleted[i])) - return true; /* Yes, CBs for future grace period. */ + if (rcu_segcblist_future_gp_needed(&rdp->cblist, + READ_ONCE(rsp->completed))) + return true; /* Yes, CBs for future grace period. */ return false; /* No grace period needed. 
*/ } @@ -797,6 +776,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -867,7 +847,6 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -917,7 +896,6 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); rdtp->dynticks_nesting--; } - rcu_sysidle_enter(1); } /* @@ -970,6 +948,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -998,7 +977,6 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -1050,7 +1028,6 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(oldval, true); - rcu_sysidle_exit(1); } /* @@ -1133,22 +1110,11 @@ void rcu_nmi_exit(void) } /** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - -/** * rcu_is_watching - see if RCU thinks that the current CPU is idle * - * If the current CPU is in its idle loop and is neither in an interrupt + * Return true if RCU is watching the running CPU, which means that this + * CPU can safely enter RCU read-side critical sections. In other words, + * if the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ bool notrace rcu_is_watching(void) @@ -1156,12 +1122,30 @@ bool notrace rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = __rcu_is_watching(); + ret = !rcu_dynticks_curr_cpu_in_eqs(); preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); +/* + * If a holdout task is actually running, request an urgent quiescent + * state from its CPU. This is unsynchronized, so migrations can cause + * the request to go to the wrong CPU. Which is OK, all that will happen + * is that the CPU's next context switch will be a bit slower and next + * time around this task will generate another request. + */ +void rcu_request_urgent_qs_task(struct task_struct *t) +{ + int cpu; + + barrier(); + cpu = task_cpu(t); + if (!task_curr(t)) + return; /* This task is not running on that CPU. */ + smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -1222,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. 
*/ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, @@ -1243,11 +1225,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; - int *rcrmp; + bool *rnhqp; + bool *ruqp; unsigned long rjtsc; struct rcu_node *rnp; @@ -1283,11 +1265,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * might not be the case for nohz_full CPUs looping in the kernel. */ rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); } /* Check for the CPU being offline. */ @@ -1304,7 +1290,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1318,16 +1304,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. 
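The rnhqp/ruqp code above pairs a plain write of rcu_need_heavy_qs with an smp_store_release() of rcu_urgent_qs, while rcu_note_context_switch() does an smp_load_acquire() of rcu_urgent_qs before looking at rcu_need_heavy_qs. Below is a single-threaded C11 model of that release/acquire handshake; the function names are invented and the real per-CPU placement is omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool need_heavy_qs;	/* Written before the release store. */
static atomic_bool urgent_qs;	/* The cheap flag checked first.     */

static void gp_kthread_requests_qs(bool heavy)
{
	need_heavy_qs = heavy;
	atomic_store_explicit(&urgent_qs, true, memory_order_release);
}

static void context_switch_path(void)
{
	if (!atomic_load_explicit(&urgent_qs, memory_order_acquire))
		return;			/* Fast path: RCU wants nothing. */
	atomic_store_explicit(&urgent_qs, false, memory_order_relaxed);
	if (need_heavy_qs)		/* Ordered after the acquire load. */
		printf("emulate a momentary dyntick-idle period\n");
	else
		printf("report an ordinary quiescent state\n");
}

int main(void)
{
	context_switch_path();		/* Nothing requested yet: no output. */
	gp_kthread_requests_qs(true);
	context_switch_path();		/* Sees the heavyweight request.     */
	return 0;
}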
*/ } @@ -1487,7 +1470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1541,7 +1525,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1644,30 +1629,6 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's default callback list - * to empty. The default callback list is the one that is not used by - * no-callbacks CPUs. - */ -static void init_default_callback_list(struct rcu_data *rdp) -{ - int i; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; -} - -/* - * Initialize the specified rcu_data structure's callback list to empty. - */ -static void init_callback_list(struct rcu_data *rdp) -{ - if (init_nocb_callback_list(rdp)) - return; - init_default_callback_list(rdp); -} - -/* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that * a CPU can invoke callbacks in a timely fashion even if that CPU has @@ -1679,6 +1640,8 @@ static void init_callback_list(struct rcu_data *rdp) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { + lockdep_assert_held(&rnp->lock); + /* * If RCU is idle, we just wait for the next grace period. * But we can only be sure that RCU is idle if we are looking @@ -1721,10 +1684,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long *c_out) { unsigned long c; - int i; bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + lockdep_assert_held(&rnp->lock); + /* * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. @@ -1767,13 +1731,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Get a new grace-period number. If there really is no grace * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. Note that even no-CBs - * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + * earlier. Adjust callbacks as needed. */ c = rcu_cbs_completed(rdp->rsp, rnp_root); - for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) - if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) - rdp->nxtcompleted[i] = c; + if (!rcu_is_nocb_cpu(rdp->cpu)) + (void)rcu_segcblist_accelerate(&rdp->cblist, c); /* * If the needed for the required grace period is already @@ -1805,9 +1767,7 @@ out: /* * Clean up any old requests for the just-ended grace period. Also return - * whether any additional grace periods have been requested. Also invoke - * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads - * waiting for this grace period to complete. + * whether any additional grace periods have been requested. 
*/ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { @@ -1853,57 +1813,29 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; - int i; - bool ret; + bool ret = false; - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return false; + lockdep_assert_held(&rnp->lock); - /* - * Starting from the sublist containing the callbacks most - * recently assigned a ->completed number and working down, find the - * first sublist that is not assignable to an upcoming grace period. - * Such a sublist has something in it (first two tests) and has - * a ->completed number assigned that will complete sooner than - * the ->completed number for newly arrived callbacks (last test). - * - * The key point is that any later sublist can be assigned the - * same ->completed number as the newly arrived callbacks, which - * means that the callbacks in any of these later sublist can be - * grouped into a single sublist, whether or not they have already - * been assigned a ->completed number. - */ - c = rcu_cbs_completed(rsp, rnp); - for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] != rdp->nxttail[i - 1] && - !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) - break; - - /* - * If there are no sublist for unassigned callbacks, leave. - * At the same time, advance "i" one sublist, so that "i" will - * index into the sublist where all the remaining callbacks should - * be grouped into. - */ - if (++i >= RCU_NEXT_TAIL) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* - * Assign all subsequent callbacks' ->completed number to the next - * full grace period and group them all in the sublist initially - * indexed by "i". + * Callbacks are often registered with incomplete grace-period + * information. Something about the fact that getting exact + * information requires acquiring a global lock... RCU therefore + * makes a conservative estimate of the grace period number at which + * a given callback will become ready to invoke. The following + * code checks this estimate and improves it when possible, thus + * accelerating callback invocation to an earlier grace-period + * number. */ - for (; i <= RCU_NEXT_TAIL; i++) { - rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtcompleted[i] = c; - } - /* Record any needed additional grace periods. */ - ret = rcu_start_future_gp(rnp, rdp, NULL); + if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ - if (!*rdp->nxttail[RCU_WAIT_TAIL]) + if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); @@ -1923,32 +1855,17 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i, j; + lockdep_assert_held(&rnp->lock); - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. 
*/ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* * Find all callbacks whose ->completed numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) - break; - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; - } - /* Clean up any sublist tail pointers that were misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; - - /* Copy down callbacks to fill in empty sublists. */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) - break; - rdp->nxttail[j] = rdp->nxttail[i]; - rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; - } + rcu_segcblist_advance(&rdp->cblist, rnp->completed); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1966,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; + lockdep_assert_held(&rnp->lock); + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && !unlikely(READ_ONCE(rdp->gpwrap))) { @@ -1993,7 +1912,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2172,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) */ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - bool isidle = false; - unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = true; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); + force_qs_rnp(rsp, dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = true; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2398,6 +2308,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2459,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + lockdep_assert_held(&rcu_get_root(rsp)->lock); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2483,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; + lockdep_assert_held(&rnp->lock); + /* Walk up the rcu_node hierarchy. 
*/ for (;;) { if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { @@ -2543,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; + lockdep_assert_held(&rnp->lock); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2591,7 +2506,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2656,6 +2571,8 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs do not have orphanable callbacks. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; @@ -2665,13 +2582,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - if (rdp->nxtlist != NULL) { - rsp->qlen_lazy += rdp->qlen_lazy; - rsp->qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - rdp->qlen_lazy = 0; - WRITE_ONCE(rdp->qlen, 0); - } + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* * Next, move those callbacks still needing a grace period to @@ -2679,31 +2591,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Some of the callbacks might have gone partway through a grace * period, but that is too bad. They get to start over because we * cannot assume that grace periods are synchronized across CPUs. - * We don't bother updating the ->nxttail[] array yet, instead - * we just reset the whole thing later on. */ - if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { - *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; - rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - } + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); /* * Then move the ready-to-invoke callbacks to the orphanage, * where some other CPU will pick them up. These will not be * required to pass though another grace period: They are done. */ - if (rdp->nxtlist != NULL) { - *rsp->orphan_donetail = rdp->nxtlist; - rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; - } + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - /* - * Finally, initialize the rcu_data structure's list to empty and - * disallow further callbacks on this CPU. - */ - init_callback_list(rdp); - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); } /* @@ -2712,22 +2611,20 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { - int i; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs are handled specially. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. 
*/ - rdp->qlen_lazy += rsp->qlen_lazy; - rdp->qlen += rsp->qlen; - rdp->n_cbs_adopted += rsp->qlen; - if (rsp->qlen_lazy != rsp->qlen) + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); - rsp->qlen_lazy = 0; - rsp->qlen = 0; + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); /* * We do not need a memory barrier here because the only way we @@ -2735,24 +2632,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) * we are the task doing the rcu_barrier(). */ - /* First adopt the ready-to-invoke callbacks. */ - if (rsp->orphan_donelist != NULL) { - *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; - for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = rsp->orphan_donetail; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - - /* And then adopt the callbacks that still need a grace period. */ - if (rsp->orphan_nxtlist != NULL) { - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); } /* @@ -2760,14 +2646,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + RCU_TRACE(unsigned long mask;) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask); + RCU_TRACE(mask = rdp->grpmask;) trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), TPS("cpuofl")); @@ -2795,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2840,9 +2727,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", - cpu, rdp->qlen, rdp->nxtlist); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -2852,14 +2741,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; - struct rcu_head *next, *list, **tail; - long bl, count, count_lazy; - int i; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count; 
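/*
 * The rewritten rcu_do_batch() body that follows relies on the rcu_cblist
 * convention that a freshly initialized local list starts with ->len == 0
 * and that each dequeue decrements it, so -rcl.len is the number of
 * callbacks invoked so far (hence "count = -rcl.len" and the
 * "-rcl.len >= bl" batch-limit check below).  Here is a minimal
 * user-space sketch of that counting convention only; the struct and
 * helper are simplified stand-ins, not the kernel's actual rcu_cblist API.
 */
#include <assert.h>
#include <stddef.h>

struct cb { struct cb *next; };

struct cblist_sketch {
	struct cb *head;
	struct cb **tail;
	long len;		/* Starts at zero for an extracted list. */
};

static struct cb *sketch_dequeue(struct cblist_sketch *rcl)
{
	struct cb *rhp = rcl->head;

	if (!rhp)
		return NULL;
	rcl->head = rhp->next;
	if (!rcl->head)
		rcl->tail = &rcl->head;
	rcl->len--;		/* Goes negative: -len == callbacks removed. */
	return rhp;
}

int main(void)
{
	struct cb cbs[3] = { { &cbs[1] }, { &cbs[2] }, { NULL } };
	struct cblist_sketch rcl = { &cbs[0], &cbs[2].next, 0 };
	long bl = 2;		/* Batch limit, analogous to rdp->blimit. */

	while (sketch_dequeue(&rcl))
		if (-rcl.len >= bl)	/* Stop after invoking bl callbacks. */
			break;
	assert(-rcl.len == bl);		/* Two callbacks "invoked", one left. */
	return 0;
}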
/* If no callbacks are ready, just return. */ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), + if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { + trace_rcu_batch_start(rsp->name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), 0); + trace_rcu_batch_end(rsp->name, 0, + !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2867,73 +2759,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. + * races with call_rcu() from interrupt handlers. Leave the + * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = &rdp->nxtlist; + trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), bl); + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); /* Invoke callbacks. */ - count = count_lazy = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - if (__rcu_reclaim(rsp->name, list)) - count_lazy++; - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && + rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_unqueue(rhp); + if (__rcu_reclaim(rsp->name, rhp)) + rcu_cblist_dequeued_lazy(&rcl); + /* + * Stop only if limit reached and CPU has something to do. + * Note: The rcl structure counts down from zero. + */ + if (-rcl.len >= bl && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); - - /* Update count, and requeue any remaining callbacks. */ - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (i = 0; i < RCU_NEXT_SIZE; i++) - if (&rdp->nxtlist == rdp->nxttail[i]) - rdp->nxttail[i] = tail; - else - break; - } + count = -rcl.len; + trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), + is_idle_task(current), rcu_is_callbacks_kthread()); + + /* Update counts and requeue any remaining callbacks. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->qlen_lazy -= count_lazy; - WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; + rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + count = rcu_segcblist_n_cbs(&rdp->cblist); + if (rdp->blimit == LONG_MAX && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. 
*/ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); + } else if (count < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = count; + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -2992,10 +2872,7 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) { int cpu; unsigned long flags; @@ -3034,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + if (f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } } @@ -3099,7 +2976,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - WARN_ON_ONCE(rdp->beenonline == 0); + WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -3117,7 +2994,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) } /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ @@ -3189,7 +3066,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > + rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); @@ -3207,10 +3085,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) + rcu_segcblist_first_pend_cb(&rdp->cblist) != head) force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } } @@ -3239,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ + /* + * Probable double call_rcu(), so leak the callback. + * Use rcu:rcu_callback trace event to find the previous + * time callback was passed to __call_rcu(). 
+ */ + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } head->func = func; @@ -3250,7 +3133,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) @@ -3269,31 +3152,45 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, */ BUG_ON(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); - if (!likely(rdp->nxtlist)) - init_default_callback_list(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); } - WRITE_ONCE(rdp->qlen, rdp->qlen + 1); - if (lazy) - rdp->qlen_lazy++; - else + rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + if (!lazy) rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + trace_rcu_callback(rsp->name, head, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } -/* - * Queue an RCU-sched callback for invocation after a grace period. +/** + * call_rcu_sched() - Queue an RCU for invocation after sched grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_sched() assumes + * that the read-side critical sections end on enabling of preemption + * or on voluntary preemption. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. + * + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { @@ -3301,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * Queue an RCU callback for invocation after a quicker grace period. +/** + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. 
This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { @@ -3378,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. */ void synchronize_sched(void) { @@ -3531,41 +3440,6 @@ void cond_synchronize_sched(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -/* Adjust sequence number for start of update-side operation. */ -static void rcu_seq_start(unsigned long *sp) -{ - WRITE_ONCE(*sp, *sp + 1); - smp_mb(); /* Ensure update-side operation after counter increment. */ - WARN_ON_ONCE(!(*sp & 0x1)); -} - -/* Adjust sequence number for end of update-side operation. */ -static void rcu_seq_end(unsigned long *sp) -{ - smp_mb(); /* Ensure update-side operation before counter increment. */ - WRITE_ONCE(*sp, *sp + 1); - WARN_ON_ONCE(*sp & 0x1); -} - -/* Take a snapshot of the update side's sequence number. */ -static unsigned long rcu_seq_snap(unsigned long *sp) -{ - unsigned long s; - - s = (READ_ONCE(*sp) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ - return s; -} - -/* - * Given a snapshot from rcu_seq_snap(), determine whether or not a - * full update-side operation has occurred. - */ -static bool rcu_seq_done(unsigned long *sp, unsigned long s) -{ - return ULONG_CMP_GE(READ_ONCE(*sp), s); -} - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -3589,7 +3463,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; @@ -3597,7 +3471,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Does this CPU have callbacks ready to invoke? 
*/ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { rdp->n_rp_cb_ready++; return 1; } @@ -3661,10 +3535,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!rdp->nxtlist) + if (rcu_segcblist_empty(&rdp->cblist)) continue; hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { al = false; break; } @@ -3711,8 +3585,14 @@ static void rcu_barrier_func(void *type) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); + rdp->barrier_head.func = rcu_barrier_callback; + debug_rcu_head_queue(&rdp->barrier_head); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + atomic_inc(&rsp->barrier_cpu_count); + } else { + debug_rcu_head_unqueue(&rdp->barrier_head); + _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + } } /* @@ -3773,7 +3653,7 @@ static void _rcu_barrier(struct rcu_state *rsp) __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (READ_ONCE(rdp->qlen)) { + } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -3831,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3882,10 +3763,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - if (!rdp->nxtlist) - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ + !init_nocb_callback_list(rdp)) + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ @@ -3902,12 +3783,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Invoked early in the CPU-online process, when pretty much all + * services are available. The incoming CPU is not present. + */ int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3921,6 +3806,9 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +/* + * Update RCU priority boot kthread affinity for CPU-hotplug changes. + */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3928,20 +3816,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Near the end of the CPU-online process. Pretty much all services + * enabled, and the CPU is now very much alive. 
+ */ int rcutree_online_cpu(unsigned int cpu) { sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); return 0; } +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ int rcutree_offline_cpu(unsigned int cpu) { rcutree_affinity_setting(cpu, cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_offline_cpu(cpu); return 0; } - +/* + * Near the end of the offline process. We do only tracing here. + */ int rcutree_dying_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3951,6 +3853,9 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; } +/* + * The outgoing CPU is gone and we are running elsewhere. + */ int rcutree_dead_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3968,6 +3873,10 @@ int rcutree_dead_cpu(unsigned int cpu) * incoming CPUs are not allowed to use RCU read-side critical sections * until this function is called. Failing to observe this restriction * will result in lockdep splats. + * + * Note that this function is special in that it is invoked directly + * from the incoming CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ void rcu_cpu_starting(unsigned int cpu) { @@ -3993,9 +3902,6 @@ void rcu_cpu_starting(unsigned int cpu) * The CPU is exiting the idle loop into the arch_cpu_idle_dead() * function. We now remove it from the rcu_node tree's ->qsmaskinit * bit masks. - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) { @@ -4011,6 +3917,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * The outgoing function has no further need of RCU, so remove it from + * the list of CPUs that RCU must track. + * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. + */ void rcu_report_dead(unsigned int cpu) { struct rcu_state *rsp; @@ -4025,6 +3939,10 @@ void rcu_report_dead(unsigned int cpu) } #endif +/* + * On non-huge systems, use expedited RCU grace periods to make suspend + * and hibernation run faster. + */ static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4095,7 +4013,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) @@ -4108,31 +4026,6 @@ void rcu_scheduler_starting(void) } /* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. 
- */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - -/* * Helper function for rcu_init() that initializes one rcu_state structure. */ static void __init rcu_init_one(struct rcu_state *rsp) @@ -4141,9 +4034,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; - int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; @@ -4158,20 +4049,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ - for (i = 0; i < rcu_num_lvls; i++) - levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; - rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; + rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < levelcnt[i]; j++, rnp++) { + for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); @@ -4344,6 +4231,8 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,80 +30,9 @@ #include <linux/seqlock.h> #include <linux/swait.h> #include <linux/stop_machine.h> +#include <linux/rcu_node_tree.h> -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. 
- */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include "rcu_segcblist.h" /* * Dynticks per-CPU state. @@ -113,14 +42,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? 
*/ unsigned long nonlazy_posted; @@ -228,19 +152,6 @@ struct rcu_node { /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -262,41 +173,6 @@ struct rcu_node { #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) /* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. - */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Iterate over all possible CPUs in a leaf RCU node. - */ -#define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) - -/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ @@ -336,34 +212,9 @@ struct rcu_data { /* period it is aware of. */ /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. 
- */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ + struct rcu_segcblist cblist; /* Segmented callback list, with */ + /* different callbacks waiting for */ + /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ @@ -440,9 +291,9 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ -#define RCU_NOGP_WAKE_NOT 0 -#define RCU_NOGP_WAKE 1 -#define RCU_NOGP_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_NOT 0 +#define RCU_NOCB_WAKE 1 +#define RCU_NOCB_WAKE_FORCE 2 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -482,7 +333,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ @@ -502,14 +352,11 @@ struct rcu_state { raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; /* Protect following fields. */ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ + /* (Contains counts.) */ /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. 
*/ @@ -596,6 +443,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -608,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(void); +static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); @@ -660,89 +508,17 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(int irq); -static void rcu_sysidle_exit(int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); -#endif /* #ifndef RCU_TREE_NONCORE */ +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ -#ifdef CONFIG_RCU_TRACE -/* Read out queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ -#ifdef CONFIG_RCU_NOCB_CPU - *ql = atomic_long_read(&rdp->nocb_q_count); - *qll = atomic_long_read(&rdp->nocb_q_count_lazy); -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - *ql = 0; - *qll = 0; -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -} -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Wrappers for the rcu_node::lock acquire and release. - * - * Because the rcu_nodes form a tree, the tree traversal locking will observe - * different lock values, this in turn means that an UNLOCK of one level - * followed by a LOCK of another level does not imply a full memory barrier; - * and most importantly transitivity is lost. - * - * In order to restore full ordering between tree levels, augment the regular - * lock acquire functions with smp_mb__after_unlock_lock(). - * - * As ->lock of struct rcu_node is a __private field, therefore one should use - * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. 
- */ -static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); -} - -static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); -} - -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) - -static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) -{ - bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); - - if (locked) - smp_mb__after_unlock_lock(); - return locked; -} +#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * * Caller must hold the rcu_state's exp_mutex. */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; @@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } @@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + smp_mb(); /* All above changes before wakeup. */ + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); mutex_unlock(&rsp->exp_wake_mutex); @@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone0, s)); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rsp->exp_mutex); @@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. 
- */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (gp_preinit_delay) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (gp_init_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (gp_cleanup_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU @@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -286,12 +313,14 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. 
*/ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. 
For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { @@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } @@ -835,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1350,10 +1385,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ if ((rdp->completed != rnp->completed || unlikely(READ_ONCE(rdp->gpwrap))) && - rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) cbs_ready = true; } return cbs_ready; @@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. 
*/ @@ -1461,7 +1493,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!*rdp->nxttail[RCU_DONE_TAIL]) + if (rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1529,7 +1561,7 @@ static void rcu_oom_notify_cpu(void *unused) for_each_rcu_flavor(rsp) { rdp = raw_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { + if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); } @@ -1709,7 +1741,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { - rcu_nocb_poll = 1; + rcu_nocb_poll = true; return 0; } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); @@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -1860,7 +1891,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1872,7 +1905,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } @@ -1930,30 +1965,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->qlen; - long qll = rsp->qlen_lazy; + long ql = rsp->orphan_done.len; + long qll = rsp->orphan_done.len_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ if (!rcu_is_nocb_cpu(smp_processor_id())) return false; - rsp->qlen = 0; - rsp->qlen_lazy = 0; /* First, enqueue the donelist, if any. This preserves CB ordering. 
*/ - if (rsp->orphan_donelist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; + if (rsp->orphan_done.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), + rcu_cblist_tail(&rsp->orphan_done), + ql, qll, flags); } - if (rsp->orphan_nxtlist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; + if (rsp->orphan_pend.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), + rcu_cblist_tail(&rsp->orphan_pend), + ql, qll, flags); } + rcu_cblist_init(&rsp->orphan_done); + rcu_cblist_init(&rsp->orphan_pend); return true; } @@ -2023,6 +2054,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); @@ -2395,16 +2415,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; /* If there are early-boot callbacks, move them to nocb lists. 
*/ - if (rdp->nxtlist) { - rdp->nocb_head = rdp->nxtlist; - rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; - atomic_long_set(&rdp->nocb_q_count, rdp->qlen); - atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); - rdp->nxtlist = NULL; - rdp->qlen = 0; - rdp->qlen_lazy = 0; + if (!rcu_segcblist_empty(&rdp->cblist)) { + rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); + rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); + atomic_long_set(&rdp->nocb_q_count, + rcu_segcblist_n_cbs(&rdp->cblist)); + atomic_long_set(&rdp->nocb_q_count_lazy, + rcu_segcblist_n_lazy_cbs(&rdp->cblist)); + rcu_segcblist_init(&rdp->cblist); } - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. 
- */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. 
*/ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. 
*/ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. 
*/ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 8751a748499a..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Read-Copy Update tracing for hierarchical implementation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2008 - * Author: Paul E. McKenney - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -#define RCU_TREE_NONCORE -#include "tree.h" - -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d bseq: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->barrier_sequence); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->core_needs_qs); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - rcu_dynticks_snap(rdp->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rdp->qlen_lazy; - ql += rdp->qlen; - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - int cpu; - struct rcu_state *rsp = (struct rcu_state *)m->private; - struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->exp_workdone0); - s1 += atomic_long_read(&rdp->exp_workdone1); - s2 += atomic_long_read(&rdp->exp_workdone2); - s3 += atomic_long_read(&rdp->exp_workdone3); - } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, - atomic_read(&rsp->expedited_need_qs), - rsp->expedited_sequence / 2); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int 
show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->gp_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - completed = READ_ONCE(rsp->completed); - gpnum = READ_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_core_needs_qs, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_nocb_defer_wakeup, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,7 +62,9 @@ #define MODULE_PARAM_PREFIX "rcupdate." 
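The update.c hunk that follows relies on MODULE_PARAM_PREFIX to expose RCU's boot-time knobs under the "rcupdate." namespace: rcu_expedited and rcu_normal are defined elsewhere (in the sysctl code), so the patch adds extern declarations ahead of the module_param() invocations here. A minimal sketch of the same pattern, assuming a made-up my_knob variable defined in some other built-in file (illustrative only, not part of the patch):

#include <linux/moduleparam.h>

#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "rcupdate."

extern int my_knob;			/* hypothetical; defined in another object file */
module_param(my_knob, int, 0444);	/* surfaces as rcupdate.my_knob */

With the prefix in place, such a parameter can be set at boot with rcupdate.my_knob=1 on the kernel command line and, permissions allowing, read back from /sys/module/rcupdate/parameters/my_knob, which is the same mechanism behind rcupdate.rcu_expedited and rcupdate.rcu_normal.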
#ifndef CONFIG_TINY_RCU +extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); +extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); @@ -124,7 +126,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -190,6 +192,39 @@ void rcu_end_inkernel_boot(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. + */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -346,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; + int j; /* Initialize and register callbacks for each flavor specified. */ for (i = 0; i < n; i++) { @@ -357,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } init_rcu_head_on_stack(&rs_array[i].head); init_completion(&rs_array[i].completion); - (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); } /* Wait for all callbacks to be invoked. */ @@ -366,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, (crcu_array[i] == call_rcu || crcu_array[i] == call_rcu_bh)) continue; - wait_for_completion(&rs_array[i].completion); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + wait_for_completion(&rs_array[i].completion); destroy_rcu_head_on_stack(&rs_array[i].head); } } @@ -527,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; -/* - * Post an RCU-tasks callback. First call must be from process context - * after the scheduler if fully operational. +/** + * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period + * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { @@ -632,6 +691,7 @@ static void check_holdout_task(struct task_struct *t, put_task_struct(t); return; } + rcu_request_urgent_qs_task(t); if (!needreport) return; if (*firstreport) { @@ -817,23 +877,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifndef CONFIG_TINY_RCU + /* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. + * Print any non-default Tasks RCU settings. */ -void rcu_test_sync_prims(void) +static void __init rcu_tasks_bootup_oddness(void) { - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PROVE_RCU /* @@ -918,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests); #else void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any significant non-default boot-time settings.
+ */ +void __init rcupdate_announce_bootup_oddness(void) +{ + if (rcu_normal) + pr_info("\tNo expedited grace period (rcu_normal).\n"); + else if (rcu_normal_after_boot) + pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); + else if (rcu_expedited) + pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); + if (rcu_cpu_stall_suppress) + pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); + if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) + pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); + rcu_tasks_bootup_oddness(); +} + +#endif /* #ifndef CONFIG_TINY_RCU */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..53f0164ed362 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-y += idle_task.o fair.o rt.o deadline.o +obj-y += wait.o wait_bit.o swait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/tick.h> +#include <linux/init.h> /* * Scheduler clock - returns current time in nanosec units. @@ -124,14 +125,27 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); + /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, @@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". 
+ */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -183,7 +211,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -197,7 +229,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account @@ -347,21 +382,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); + scd = this_scd(); + __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; + + if (!sched_clock_stable()) + return; + /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. + * Called under watchdog_lock. * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. 
*/ - scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); - - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* @@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); - touch_softlockup_watchdog_sched(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, if (!x->done) { DECLARE_WAITQUEUE(wait, current); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c51147a1204c..17c667b427b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,7 @@ #include <uapi/linux/sched/types.h> #include <linux/sched/loadavg.h> #include <linux/sched/hotplug.h> +#include <linux/wait_bit.h> #include <linux/cpuset.h> #include <linux/delayacct.h> #include <linux/init_task.h> @@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - /* * __normal_prio - return the priority that is based on the static prio */ @@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. 
+ */ + old_stop->sched_class = &rt_sched_class; + } +} + #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, @@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } @@ -2148,23 +2140,6 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* - * This function clears the sched_dl_entity static params. - */ -void __dl_clear_params(struct task_struct *p) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; -} - -/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. * @@ -2193,6 +2168,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2430,7 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; + return BW_UNIT; /* * Doing this here saves a lot of checks in all @@ -2440,93 +2416,9 @@ unsigned long to_ratio(u64 period, u64 runtime) if (period == 0) return 0; - return div64_u64(runtime << 20, period); + return div64_u64(runtime << BW_SHIFT, period); } -#ifdef CONFIG_SMP -inline struct dl_bw *dl_bw_of(int i) -{ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - return &cpu_rq(i)->rd->dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; -} -#else -inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} -#endif - -/* - * We must be sure that accepting a new task (or allowing changing the - * parameters of an existing one) is consistent with the bandwidth - * constraints. If yes, this function also accordingly updates the currently - * allocated bandwidth to reflect the new situation. - * - * This function is called while holding p's rq->lock. - * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). - */ -static int dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr) -{ - - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period ?: attr->sched_deadline; - u64 runtime = attr->sched_runtime; - u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; - int cpus, err = -1; - - /* !deadline task may carry old deadline bandwidth */ - if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) - return 0; - - /* - * Either if a task, enters, leave, or stays -deadline but changes - * its parameters, we may need to update accordingly the total - * allocated bandwidth of the container. - */ - raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); - if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { - __dl_add(dl_b, new_bw); - err = 0; - } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); - err = 0; - } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); - err = 0; - } - raw_spin_unlock(&dl_b->lock); - - return err; -} - -extern void init_dl_bw(struct dl_bw *dl_b); - /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -3382,7 +3274,7 @@ static void __sched notrace __schedule(bool preempt) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -3502,6 +3394,31 @@ asmlinkage __visible void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/* + * synchronize_rcu_tasks() makes sure that no task is stuck in preempted + * state (have scheduled out non-voluntarily) by making sure that all + * tasks have either left the run queue or have gone into user space. + * As idle tasks do not do either, they must not ever be preempted + * (schedule out non-voluntarily). + * + * schedule_idle() is similar to schedule_preempt_disable() except that it + * never enables preemption because it does not call sched_submit_work(). + */ +void __sched schedule_idle(void) +{ + /* + * As this skips calling sched_submit_work(), which the idle task does + * regardless because that function is a nop when the task is in a + * TASK_RUNNING state, make sure this isn't used someplace that the + * current task can be in any other state. Note, idle is always in the + * TASK_RUNNING state. + */ + WARN_ON_ONCE(current->state); + do { + __schedule(false); + } while (need_resched()); +} + #ifdef CONFIG_CONTEXT_TRACKING asmlinkage __visible void __sched schedule_user(void) { @@ -3662,7 +3579,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { return try_to_wake_up(curr->private, mode, wake_flags); @@ -3984,46 +3901,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) } /* - * This function initializes the sched_dl_entity of a newly becoming - * SCHED_DEADLINE task. - * - * Only the static values are considered here, the actual runtime and the - * absolute deadline will be properly calculated when the task is enqueued - * for the first time with its new policy. 
- */ -static void -__setparam_dl(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = attr->sched_runtime; - dl_se->dl_deadline = attr->sched_deadline; - dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; - dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - - /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. - * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. - */ -} - -/* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. */ @@ -4076,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &fair_sched_class; } -static void -__getparam_dl(struct task_struct *p, struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; - attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; -} - -/* - * This function validates the new parameters of a -deadline task. - * We ask for the deadline not being zero, and greater or equal - * than the runtime, as well as the period of being zero or - * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution of 1us (we - * check sched_runtime only since it is always the smaller one) and - * below 2^63 ns (we have to check both sched_deadline and - * sched_period, as the latter can be zero). - */ -static bool -__checkparam_dl(const struct sched_attr *attr) -{ - /* deadline != 0 */ - if (attr->sched_deadline == 0) - return false; - - /* - * Since we truncate DL_SCALE bits, make sure we're at least - * that big. - */ - if (attr->sched_runtime < (1ULL << DL_SCALE)) - return false; - - /* - * Since we use the MSB for wrap-around and sign issues, make - * sure it's not set (mind that period can be equal to zero). 
- */ - if (attr->sched_deadline & (1ULL << 63) || - attr->sched_period & (1ULL << 63)) - return false; - - /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || - attr->sched_deadline < attr->sched_runtime) - return false; - - return true; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -4145,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - if (dl_se->dl_runtime != attr->sched_runtime || - dl_se->dl_deadline != attr->sched_deadline || - dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) - return true; - - return false; -} - static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) @@ -4172,8 +3983,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { @@ -4186,7 +3997,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* @@ -4337,7 +4149,7 @@ change: * of a SCHED_DEADLINE task) we need to check if enough bandwidth * is available. */ - if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -5438,26 +5250,17 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +#ifdef CONFIG_SMP + int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; - struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; if (!cpumask_weight(cur)) return ret; - rcu_read_lock_sched(); - cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - - raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) - ret = 0; - raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); - rcu_read_unlock_sched(); + ret = dl_cpuset_cpumask_can_shrink(cur, trial); return ret; } @@ -5481,43 +5284,14 @@ int task_can_attach(struct task_struct *p, goto out; } -#ifdef CONFIG_SMP if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); - struct dl_bw *dl_b; - bool overflow; - int cpus; - unsigned long flags; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) - ret = -EBUSY; - else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). 
- */ - __dl_add(dl_b, p->dl.dl_bw); - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); + cs_cpus_allowed)) + ret = dl_task_can_attach(p, cs_cpus_allowed); - } -#endif out: return ret; } -#ifdef CONFIG_SMP - bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING @@ -5580,7 +5354,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm_irqs_off(mm, &init_mm, current); + switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5780,23 +5554,8 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { - unsigned long flags; - struct dl_bw *dl_b; - bool overflow; - int cpus; - if (!cpuhp_tasks_frozen) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) + if (dl_cpu_busy(cpu)) return -EBUSY; cpuset_update_active_cpus(); } else { @@ -5849,15 +5608,9 @@ int sched_cpu_deactivate(unsigned int cpu) * users of this state to go away such that all new such users will * observe it. * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * * Do sync before park smpboot threads to take care the rcu boost case. */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + synchronize_rcu_mult(call_rcu, call_rcu_sched); if (!sched_smp_initialized) return 0; @@ -5933,7 +5686,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5943,7 +5695,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -5959,7 +5711,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -5975,7 +5726,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -6001,28 +5751,13 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); -#define WAIT_TABLE_BITS 8 -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 
5 : 6; - unsigned long val = (unsigned long)word << shift | bit; - - return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); -} -EXPORT_SYMBOL(bit_waitqueue); - void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; sched_clock_init(); - - for (i = 0; i < WAIT_TABLE_SIZE; i++) - init_waitqueue_head(bit_wait_table + i); + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); @@ -6174,7 +5909,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); @@ -6226,8 +5960,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; @@ -6482,385 +6218,6 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, tsk, &rf); } -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - /* - * Autogroups do not have RT tasks; see autogroup_create(). - */ - if (task_group_is_autogroup(tg)) - return 0; - - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } - - return 0; -} - -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 rt_runtime; -}; - -static int tg_rt_schedulable(struct task_group *tg, void *data) -{ - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; - - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; - - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; - - total = to_ratio(period, runtime); - - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; - - /* - * The sum of our children's runtime should not exceed our own. 
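The constraints enforced by tg_rt_schedulable() (the per-child summation continues just below) reduce to comparisons between to_ratio() values: a group may not promise more bandwidth than the global limit, and the sum over its children may not exceed the group's own allowance. A minimal user-space sketch of that arithmetic, using made-up group settings and the same 2^20 fixed-point scale as to_ratio(); the RUNTIME_INF and tg_has_rt_tasks() special cases are left out:

#include <stdio.h>
#include <stdint.h>

/* Same fixed-point ratio as the kernel's to_ratio(): runtime << 20 / period */
static uint64_t to_ratio_demo(uint64_t period, uint64_t runtime)
{
	return period ? (runtime << 20) / period : 0;
}

int main(void)
{
	/* Hypothetical global limit: 950ms of RT runtime every 1s */
	uint64_t global = to_ratio_demo(1000000000ULL, 950000000ULL);
	/* Hypothetical parent group: 400ms / 1s */
	uint64_t parent = to_ratio_demo(1000000000ULL, 400000000ULL);
	/* Two hypothetical children: 250ms / 1s and 100ms / 1s */
	uint64_t children = to_ratio_demo(1000000000ULL, 250000000ULL) +
			    to_ratio_demo(1000000000ULL, 100000000ULL);

	if (parent > global)
		puts("parent exceeds the global setting: -EINVAL");
	else if (children > parent)
		puts("children exceed their parent: -EINVAL");
	else
		puts("configuration is schedulable");
	return 0;
}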
- */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; - - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - sum += to_ratio(period, runtime); - } - - if (sum > total) - return -EINVAL; - - return 0; -} - -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - int ret; - - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - rcu_read_lock(); - ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); - rcu_read_unlock(); - - return ret; -} - -static int tg_set_rt_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - /* - * Disallowing the root group RT runtime is BAD, it would disallow the - * kernel creating (and or operating) RT threads. - */ - if (tg == &root_task_group && rt_runtime == 0) - return -EINVAL; - - /* No period doesn't make any sense. */ - if (rt_period == 0) - return -EINVAL; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) - goto unlock; - - raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); -unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return rt_runtime_us; -} - -static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int sched_rt_global_constraints(void) -{ - int ret = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} - -static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - 
for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static int sched_dl_global_validate(void) -{ - u64 runtime = global_rt_runtime(); - u64 period = global_rt_period(); - u64 new_bw = to_ratio(period, runtime); - struct dl_bw *dl_b; - int cpu, ret = 0; - unsigned long flags; - - /* - * Here we want to check the bandwidth not being set to some - * value smaller than the currently allocated bandwidth in - * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) - ret = -EBUSY; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (ret) - break; - } - - return ret; -} - -static void sched_dl_do_global(void) -{ - u64 new_bw = -1; - struct dl_bw *dl_b; - int cpu; - unsigned long flags; - - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - - if (global_rt_runtime() != RUNTIME_INF) - new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - - /* - * FIXME: As above... - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - dl_b->bw = new_bw; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - } -} - -static int sched_rt_global_validate(void) -{ - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) - return -EINVAL; - - return 0; -} - -static void sched_rt_do_global(void) -{ - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); -} - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - int ret; - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_validate(); - if (ret) - goto undo; - - ret = sched_dl_global_validate(); - if (ret) - goto undo; - - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); - sched_dl_do_global(); - } - if (0) { -undo: - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } - mutex_unlock(&mutex); - - return ret; -} - -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* - * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: - */ - if (!ret && write) { - sched_rr_timeslice = - sysctl_sched_rr_timeslice <= 0 ? 
RR_TIMESLICE : - msecs_to_jiffies(sysctl_sched_rr_timeslice); - } - mutex_unlock(&mutex); - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..076a2e31951c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sg_policy->next_freq == next_freq) return; - if (sg_policy->next_freq > next_freq) - next_freq = (sg_policy->next_freq + next_freq) >> 1; - sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -245,11 +242,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -265,7 +261,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; continue; @@ -309,7 +305,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aea3135c5d90..67c70e287647 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,19 +615,13 @@ static void cputime_adjust(struct task_cputime *curr, * userspace. Once a task gets some ticks, the monotonicy code at * 'update' will ensure things converge to the observed ratio. */ - if (stime == 0) { - utime = rtime; - goto update; + if (stime != 0) { + if (utime == 0) + stime = rtime; + else + stime = scale_stime(stime, rtime, stime + utime); } - if (utime == 0) { - stime = rtime; - goto update; - } - - stime = scale_stime(stime, rtime, stime + utime); - -update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. 
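The restructured cputime_adjust() hunk above keeps the tick-sampled stime/utime ratio but forces the total to match the precise rtime. A stand-alone sketch of that split with made-up sample values; it skips the 64-bit overflow care taken by the kernel's scale_stime() and the monotonicity clamping mentioned in the comment:

#include <stdio.h>
#include <stdint.h>

/*
 * Proportional split: scaled stime = rtime * stime / (stime + utime).
 * scale_stime() computes the same value while avoiding 64-bit overflow;
 * a GCC-style unsigned __int128 is good enough for a demo.
 */
static uint64_t scale_stime_demo(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}

int main(void)
{
	/* Ticks say 30% system / 70% user; precise rtime is 1000000 ns */
	uint64_t stime = 3000, utime = 7000, rtime = 1000000;

	if (stime == 0) {
		utime = rtime;		/* all of rtime counts as user time */
	} else if (utime == 0) {
		stime = rtime;		/* all of rtime counts as system time */
	} else {
		stime = scale_stime_demo(stime, rtime, stime + utime);
		utime = rtime - stime;	/* keep stime + utime == rtime */
	}
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)stime, (unsigned long long)utime);
	return 0;	/* prints stime=300000 utime=700000 */
}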
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..a84299f44b5d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ #include "sched.h" #include <linux/slab.h> +#include <uapi/linux/sched/types.h> struct dl_bandwidth def_dl_bandwidth; @@ -43,6 +44,254 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_SMP +static inline struct dl_bw *dl_bw_of(int i) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +static inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + +static inline +void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw += dl_bw; + SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +static inline +void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + if (dl_rq->running_bw > old) + dl_rq->running_bw = 0; +} + +static inline +void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw += dl_bw; + SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ +} + +static inline +void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + if (dl_rq->this_bw > old) + dl_rq->this_bw = 0; + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + struct rq *rq; + + if (task_on_rq_queued(p)) + return; + + rq = task_rq(p); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(new_bw, &rq->dl); +} + +/* + * The utilization of a task cannot be immediately removed from + * the rq active utilization (running_bw) when the task blocks. + * Instead, we have to wait for the so called "0-lag time". + * + * If a task blocks before the "0-lag time", a timer (the inactive + * timer) is armed, and running_bw is decreased when the timer + * fires. + * + * If the task wakes up again before the inactive timer fires, + * the timer is cancelled, whereas if the task wakes up after the + * inactive timer fired (and running_bw has been decreased) the + * task's utilization has to be added to running_bw again. 
+ * A flag in the deadline scheduling entity (dl_non_contending) + * is used to avoid race conditions between the inactive timer handler + * and task wakeups. + * + * The following diagram shows how running_bw is updated. A task is + * "ACTIVE" when its utilization contributes to running_bw; an + * "ACTIVE contending" task is in the TASK_RUNNING state, while an + * "ACTIVE non contending" task is a blocked task for which the "0-lag time" + * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" + * time already passed, which does not contribute to running_bw anymore. + * +------------------+ + * wakeup | ACTIVE | + * +------------------>+ contending | + * | add_running_bw | | + * | +----+------+------+ + * | | ^ + * | dequeue | | + * +--------+-------+ | | + * | | t >= 0-lag | | wakeup + * | INACTIVE |<---------------+ | + * | | sub_running_bw | | + * +--------+-------+ | | + * ^ | | + * | t < 0-lag | | + * | | | + * | V | + * | +----+------+------+ + * | sub_running_bw | ACTIVE | + * +-------------------+ | + * inactive timer | non contending | + * fired +------------------+ + * + * The task_non_contending() function is invoked when a task + * blocks, and checks if the 0-lag time already passed or + * not (in the first case, it directly updates running_bw; + * in the second case, it arms the inactive timer). + * + * The task_contending() function is invoked when a task wakes + * up, and checks if the task is still in the "ACTIVE non contending" + * state or not (in the second case, it updates running_bw). + */ +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_clear_params(p); + raw_spin_unlock(&dl_b->lock); + } + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se, int flags) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (flags & ENQUEUE_MIGRATED) + add_rq_bw(dl_se->dl_bw, dl_rq); + + if (dl_se->dl_non_contending) { + dl_se->dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. 
+ */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. + */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -83,6 +332,10 @@ void init_dl_rq(struct dl_rq *dl_rq) #else init_dl_bw(&dl_rq->dl_bw); #endif + + dl_rq->running_bw = 0; + dl_rq->this_bw = 0; + init_dl_rq_bw_ratio(dl_rq); } #ifdef CONFIG_SMP @@ -484,13 +737,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. + * + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. 
+ * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -500,6 +824,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -593,10 +925,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -723,6 +1053,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } @@ -735,6 +1067,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* + * This function implements the GRUB accounting rule: + * according to the GRUB reclaiming algorithm, the runtime is + * not decreased as "dq = -dt", but as + * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * where u is the utilization of the task, Umax is the maximum reclaimable + * utilization, Uinact is the (per-runqueue) inactive utilization, computed + * as the difference between the "total runqueue utilization" and the + * runqueue active utilization, and Uextra is the (per runqueue) extra + * reclaimable utilization. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations + * multiplied by 2^BW_SHIFT, the result has to be shifted right by + * BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. 
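The reclaiming rule spelled out above can be sanity-checked in isolation. The sketch below repeats the fixed-point arithmetic of grub_reclaim() (BW_SHIFT = 20 and RATIO_SHIFT = 8, as in this patch) with made-up runqueue numbers; it is only an illustration of the formula, not kernel code:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT	20
#define BW_UNIT		(1ULL << BW_SHIFT)
#define RATIO_SHIFT	8

/* Same arithmetic as grub_reclaim(), with the per-rq fields passed in */
static uint64_t grub_demo(uint64_t delta, uint64_t this_bw, uint64_t running_bw,
			  uint64_t extra_bw, uint64_t bw_ratio, uint64_t dl_bw)
{
	uint64_t u_inact = this_bw - running_bw;	/* Utot - Uact */
	uint64_t u_act_min = (dl_bw * bw_ratio) >> RATIO_SHIFT;
	uint64_t u_act;

	if (u_inact + extra_bw > BW_UNIT - u_act_min)
		u_act = u_act_min;
	else
		u_act = BW_UNIT - u_inact - extra_bw;

	return (delta * u_act) >> BW_SHIFT;
}

int main(void)
{
	/* u = 0.25 and Umax = 0.95, so bw_ratio ~= (1 / 0.95) * 2^8 = 269 */
	uint64_t dl_bw = BW_UNIT / 4, bw_ratio = 269;
	/* this_bw = 0.75, running_bw = 0.25 (Uinact = 0.5), Uextra = 0.05 */
	uint64_t charged = grub_demo(1000000, 3 * BW_UNIT / 4, BW_UNIT / 4,
				     BW_UNIT / 20, bw_ratio, dl_bw);

	/* 1 - Uinact - Uextra = 0.45 wins over u_act_min ~= 0.263 */
	printf("charged %llu ns out of 1000000 ns\n",	/* ~450000 */
	       (unsigned long long)charged);
	return 0;
}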
+ */ +u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +{ + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ + u64 u_act; + u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + + /* + * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, + * we compare u_inact + rq->dl.extra_bw with + * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because + * u_inact + rq->dl.extra_bw can be larger than + * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative + * leading to wrong results) + */ + if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) + u_act = u_act_min; + else + u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + + return (delta * u_act) >> BW_SHIFT; +} + +/* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). */ @@ -776,6 +1149,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; throttle: @@ -815,6 +1190,56 @@ throttle: } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -946,10 +1371,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) { + task_contending(dl_se, flags); update_dl_entity(dl_se, pi_se); - else if (flags & ENQUEUE_REPLENISH) + } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } __enqueue_dl_entity(dl_se); } @@ -959,11 +1386,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -995,17 +1417,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. 
*/ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + add_running_bw(p->dl.dl_bw, &rq->dl); + } + /* - * If p is throttled, we do nothing. In fact, if it exhausted + * If p is throttled, we do not enqueue it. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl, flags); + return; + } enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1023,6 +1460,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); + + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(p->dl.dl_bw, &rq->dl); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(p); } /* @@ -1100,6 +1554,37 @@ out: return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + struct rq *rq; + + if (p->state != TASK_WAKING) + return; + + rq = task_rq(p); + /* + * Since p->state == TASK_WAKING, set_task_cpu() has been called + * from try_to_wake_up(). Hence, p->pi_lock is locked, but + * rq->lock is not... So, lock it + */ + raw_spin_lock(&rq->lock); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_unlock(&rq->lock); +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1255,19 +1740,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void task_dead_dl(struct task_struct *p) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - /* - * Since we are TASK_DEAD we won't slip out of the domain! - */ - raw_spin_lock_irq(&dl_b->lock); - /* XXX we should retain the bw until 0-lag */ - dl_b->total_bw -= p->dl.dl_bw; - raw_spin_unlock_irq(&dl_b->lock); -} - static void set_curr_task_dl(struct rq *rq) { struct task_struct *p = rq->curr; @@ -1533,7 +2005,7 @@ retry: * then possible that next_task has migrated. 
*/ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. @@ -1551,7 +2023,11 @@ retry: } deactivate_task(rq, next_task, 0); + sub_running_bw(next_task->dl.dl_bw, &rq->dl); + sub_rq_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); + add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1639,7 +2115,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + sub_running_bw(p->dl.dl_bw, &src_rq->dl); + sub_rq_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_rq_bw(p->dl.dl_bw, &this_rq->dl); + add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1695,7 +2175,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw); + __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -1737,13 +2217,26 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + if (!task_on_rq_queued(p)) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1762,11 +2255,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. 
*/ - if (!task_on_rq_queued(p)) - return; + if (!task_on_rq_queued(p)) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + return; + } /* * If p is boosted we already updated its params in * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), @@ -1836,6 +2333,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, @@ -1845,7 +2343,6 @@ const struct sched_class dl_sched_class = { .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, - .task_dead = task_dead_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, @@ -1854,6 +2351,317 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; +int sched_dl_global_validate(void) +{ + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; + int cpu, ret = 0; + unsigned long flags; + + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (ret) + break; + } + + return ret; +} + +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); + } +} + +void sched_dl_do_global(void) +{ + u64 new_bw = -1; + struct dl_bw *dl_b; + int cpu; + unsigned long flags; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + dl_b->bw = new_bw; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + } +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; + int cpus, err = -1; + + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + dl_change_utilization(p, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ +void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); +} + +void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +bool __checkparam_dl(const struct sched_attr *attr) +{ + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). 
+ */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; +} + +/* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; +} + +bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + +#ifdef CONFIG_SMP +int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) +{ + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus, ret; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, cpus); + ret = 0; + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +bool dl_cpu_busy(unsigned int cpu) +{ + unsigned long flags; + struct dl_bw *dl_b; + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return overflow; +} +#endif + #ifdef CONFIG_SCHED_DEBUG extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 38f019324f1a..4fa66de52bd6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x)) #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) - P(rt_nr_running); + PU(rt_nr_running); +#ifdef CONFIG_SMP + PU(rt_nr_migratory); +#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); #undef PN +#undef PU 
#undef P } @@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) struct dl_bw *dl_bw; SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); - SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); + +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) + + PU(dl_nr_running); #ifdef CONFIG_SMP + PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; #endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); + +#undef PU } extern __read_mostly int sched_clock_running; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..008c514dc241 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -1381,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); @@ -2584,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* + * In low-load situations, where this_cpu's node is idle due to the + * sync cause above having dropped this_load.load to 0, move the task. + * Moving to an idle socket will not create a bad imbalance. + * + * Otherwise check if the nodes are near enough in load to allow this + * task to be woken on this_cpu's node. 
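The capacity-weighted comparison described above, and implemented in the code that follows, is easy to try with concrete numbers. A stand-alone sketch; the imbalance_pct of 125, the loads and the capacities are assumed example values, and only the arithmetic mirrors the new numa_wake_affine():

#include <stdio.h>
#include <stdbool.h>

/*
 * Allow the wakeup on this_cpu's node only if its resulting load, scaled
 * by the other node's capacity, stays within the source node's load padded
 * by half of the domain's imbalance margin, (imbalance_pct - 100) / 2.
 */
static bool numa_wake_affine_demo(unsigned long imbalance_pct,
				  unsigned long task_load,
				  unsigned long this_load, unsigned long this_cap,
				  unsigned long prev_load, unsigned long prev_cap)
{
	long long this_eff_load = 100;
	long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	this_eff_load *= prev_cap;
	prev_eff_load *= this_cap;

	this_eff_load *= this_load + task_load;
	prev_eff_load *= prev_load - task_load;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* Equal capacities, task load 300, destination at 500, source at 1200 */
	bool ok = numa_wake_affine_demo(125, 300, 500, 1024, 1200, 1024);

	/* 100 * 1024 * 800 = 81.9M vs 112 * 1024 * 900 = 103.2M -> allowed */
	printf("wake affine allowed: %s\n", ok ? "yes" : "no");
	return 0;
}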
+ */ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2596,6 +2651,15 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +#ifdef CONFIG_SMP +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} +#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -2916,12 +2980,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } @@ -2982,8 +3046,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share() (which is - * done) and effective_load() (which is not done because it is too costly). + * Updating tg's load_avg is necessary before update_cfs_share(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -3563,7 +3626,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_runtime_enabled()) { printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enabled or " + "kernel parameter schedstats=enable or " "kernel.sched_schedstats=1\n"); } #endif @@ -4642,24 +4705,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. 
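The numa_wake_affine() comparison just added can be read as: pull the task if the load it would add to this_cpu's node, weighted by the other node's capacity, does not exceed the load it would leave behind, weighted by this node's capacity plus half the imbalance_pct margin. A rough userspace rendering of that arithmetic; all capacities, loads and the imbalance_pct value are invented for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t prev_capacity = 1024, this_capacity = 1024; /* per-node compute_capacity */
	uint64_t prev_load = 900, this_load = 400;           /* per-node load             */
	uint64_t task_load = 150;                            /* task_h_load(p)            */
	unsigned int imbalance_pct = 125;                    /* illustrative domain value */

	uint64_t this_eff_load = 100;
	uint64_t prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	this_eff_load *= prev_capacity;
	prev_eff_load *= this_capacity;

	this_eff_load *= this_load + task_load;              /* load if the task moves */
	prev_eff_load *= prev_load - task_load;              /* load if the task stays */

	printf("this_eff_load=%llu prev_eff_load=%llu -> %s\n",
	       (unsigned long long)this_eff_load,
	       (unsigned long long)prev_eff_load,
	       this_eff_load <= prev_eff_load ? "wake affine" : "leave on prev_cpu");
	return 0;
}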
+ */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4759,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -5215,126 +5298,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. - * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j (1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. - * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. 
- */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent) /* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); - - tg = cfs_rq->tg; - - /* - * W = @wg + \Sum rw_j - */ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; - - /* - * w = rw_i + @wl - */ - w += wl; - - /* - * wl = S * s'_i; see (2) - */ - if (W > 0 && w < W) - wl = (w * (long)scale_load_down(tg->shares)) / W; - else - wl = scale_load_down(tg->shares); - - /* - * Per the above, wl is the new se->load.weight value; since - * those are clipped to [MIN_SHARES, ...) do so now. See - * calc_cfs_shares(). - */ - if (wl < MIN_SHARES) - wl = MIN_SHARES; - - /* - * wl = dw_i = S * (s'_i - s_i); see (3) - */ - wl -= se->avg.load_avg; - - /* - * Recursively apply this logic to all parent groups to compute - * the final effective load change on the root group. Since - * only the @tg group gets extra weight, all parent groups can - * only redistribute existing shares. @wl is the shift in shares - * resulting from this level per the above. - */ - wg = 0; - } - - return wl; -} -#else - -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - return wl; -} - -#endif - static void record_wakee(struct task_struct *p) { /* @@ -5385,67 +5348,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { - s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.avg.load_avg; + int this_cpu = smp_processor_id(); + bool affine = false; /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. 
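The long comment deleted above works its shares arithmetic through a concrete case: per-CPU weights rw = {2, 4, 1, 0}, shares s_i = rw_i / sum(rw_j), and the shift dw_i = S * (s'_i - s_i) after one unit of weight is added to CPU 0. The following throwaway program only reproduces those fractions (the 5/56 and -4/56 quoted in the removed text):

#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };   /* runqueue weights per CPU        */
	double wl = 1.0;                 /* weight added to CPU 0           */
	double S = 1.0;                  /* group weight seen by its parent */
	double sum = 0.0;
	int i;

	for (i = 0; i < 4; i++)
		sum += rw[i];

	for (i = 0; i < 4; i++) {
		double s_old = rw[i] / sum;                              /* (1) */
		double s_new = (rw[i] + (i == 0 ? wl : 0)) / (sum + wl); /* (2) */

		/* dw_i = S * (s'_i - s_i), eq. (3): +5/56 for CPU 0, -4/56 for CPU 1 */
		printf("cpu%d: s=%.4f s'=%.4f dw=%+.4f\n",
		       i, s_old, s_new, S * (s_new - s_old));
	}
	return 0;
}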
+ * Common case: CPUs are in the same socket, and select_idle_sibling() + * will do its thing regardless of what we return: */ - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); - - if (this_load > 0) { - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - } - - balanced = this_eff_load <= prev_eff_load; + if (cpus_share_cache(prev_cpu, this_cpu)) + affine = true; + else + affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + if (affine) { + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + } - if (!balanced) - return 0; - - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - - return 1; + return affine; } static inline int task_util(struct task_struct *p); @@ -5484,12 +5405,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5523,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5640,43 +5561,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. 
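The removed cpumask_next_wrap()/for_each_cpu_wrap() pair implemented a scan over a cpumask that starts at a given CPU and wraps around, so that concurrent idle-CPU searches do not all converge on the same CPU; the callers above now pass only three arguments, so a generic variant evidently lives elsewhere. A toy version of the wrap-around walk over a plain bit array, purely illustrative:

#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
	int idle[NR_CPUS] = { 1, 0, 0, 1, 1, 0, 1, 1 };  /* toy idle mask            */
	int start = 5;                                   /* scan origin (target CPU) */
	int i;

	/* Visit every CPU exactly once, beginning just after @start. */
	for (i = 1; i <= NR_CPUS; i++) {
		int cpu = (start + i) % NR_CPUS;

		if (idle[cpu]) {
			printf("first idle cpu found: %d\n", cpu);
			break;
		}
	}
	return 0;
}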
- */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5620,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5630,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5809,27 +5693,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) @@ -6011,11 +5906,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + if (cpu == prev_cpu) + goto pick_cpu; + + if (wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { + pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -6168,8 +6067,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6177,8 +6079,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) @@ -6686,6 +6591,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (env->idle != CPU_NOT_IDLE) + return -1; + if (numa_group) { src_faults = group_faults(p, src_nid); dst_faults = group_faults(p, dst_nid); @@ -6970,10 +6879,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6983,7 +6910,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6997,6 +6924,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -7229,7 +7163,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. 
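The SIS_PROP logic added to select_idle_cpu() further up bounds the number of CPUs scanned in proportion to how idle this runqueue has been relative to the average cost of a scan, with a floor of four CPUs. A rough standalone model of that budget computation; all values are invented:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t avg_idle = 20000 / 512;   /* this_rq()->avg_idle (20us), pre-scaled */
	uint64_t avg_cost = 77 + 1;        /* this_sd->avg_scan_cost + 1             */
	unsigned int span_weight = 16;     /* CPUs in the LLC domain                 */
	int nr = INT_MAX;

	uint64_t span_avg = span_weight * avg_idle;

	if (span_avg > 4 * avg_cost)
		nr = span_avg / avg_cost;
	else
		nr = 4;

	printf("budget: scan at most %d of %u CPUs\n", nr, span_weight);
	return 0;
}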
*/ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7408,7 +7342,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7572,7 +7506,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7927,7 +7861,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8033,7 +7967,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* @@ -8043,11 +7976,9 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; @@ -8083,7 +8014,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -8659,6 +8590,10 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!is_housekeeping_cpu(cpu)) + return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -9523,10 +9458,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls @@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2a25a9ec2c6e..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void do_idle(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { @@ -265,7 +266,7 @@ static void do_idle(void) smp_mb__after_atomic(); sched_ttwu_pending(); - schedule_preempt_disabled(); + schedule_idle(); if (unlikely(klp_patch_pending(current))) klp_update_patch_state(current); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * load-average relies on per-cpu sampling from the tick, it is affected by * NO_HZ. * - * The basic idea is to fold the nr_active delta into a global idle-delta upon + * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * entering NO_HZ state such that we can include this as an 'extra' cpu delta * when we read the global state. * @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * - When we go NO_HZ idle during the window, we can negate our sample * contribution, causing under-accounting. * - * We avoid this by keeping two idle-delta counters and flipping them + * We avoid this by keeping two NO_HZ-delta counters and flipping them * when the window starts, thus separating old and new NO_HZ load. * * The only trick is the slight shift in index flip for read vs write. @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * r:0 0 1 1 0 0 1 1 0 * w:0 1 1 0 0 1 1 0 0 * - * This ensures we'll fold the old idle contribution in this window while + * This ensures we'll fold the old NO_HZ contribution in this window while * accumlating the new one. * - * - When we wake up from NO_HZ idle during the window, we push up our + * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which + * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. + * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_idle[2]; +static atomic_long_t calc_load_nohz[2]; static int calc_load_idx; static inline int calc_load_write_idx(void) @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) /* * If the folding window started, make sure we start writing in the - * next idle-delta. + * next NO_HZ-delta. 
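The two calc_load_nohz[] buckets described in the rewritten loadavg comments exist so that a CPU entering NO_HZ after the fold window has opened parks its delta in the next bucket, while the global fold only ever consumes the current one; that is what the r/w index table expresses. A toy model of the index selection and the fold, assuming plain longs instead of atomics:

#include <stdio.h>

static long calc_load_nohz[2];
static int calc_load_idx;

static int write_idx(int window_started)
{
	int idx = calc_load_idx;

	/* If the folding window already started, write into the next bucket. */
	if (window_started)
		idx++;

	return idx & 1;
}

static int read_idx(void)
{
	return calc_load_idx & 1;
}

int main(void)
{
	calc_load_nohz[write_idx(0)] += 3;  /* went NO_HZ before the window opened */
	calc_load_nohz[write_idx(1)] += 2;  /* went NO_HZ after it opened          */

	/* Global fold: consume only the old bucket, then flip the index. */
	printf("folded %ld now, %ld deferred to the next window\n",
	       calc_load_nohz[read_idx()], calc_load_nohz[read_idx() ^ 1]);
	calc_load_nohz[read_idx()] = 0;
	calc_load_idx++;
	return 0;
}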
*/ if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_enter_idle(void) +void calc_load_nohz_start(void) { struct rq *this_rq = this_rq(); long delta; /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. */ delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); + atomic_long_add(delta, &calc_load_nohz[idx]); } } -void calc_load_exit_idle(void) +void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_fold_idle(void) +static long calc_load_nohz_fold(void) { int idx = calc_load_read_idx(); long delta = 0; - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); + if (atomic_long_read(&calc_load_nohz[idx])) + delta = atomic_long_xchg(&calc_load_nohz[idx], 0); return delta; } @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, /* * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. + * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into + * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold + * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. @@ -330,7 +330,7 @@ static void calc_global_nohz(void) } /* - * Flip the idle index... + * Flip the NO_HZ index... * * Make sure we first write the new time then flip the index, so that * calc_load_write_idx() will see the new time when it reads the new @@ -341,7 +341,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_fold_idle(void) { return 0; } +static inline long calc_load_nohz_fold(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. */ - delta = calc_load_fold_idle(); + delta = calc_load_nohz_fold(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + * In case we went to NO_HZ for multiple LOAD_FREQ intervals + * catch up in bulk. */ calc_global_nohz(); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..45caf937ef90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. 
Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { @@ -1819,7 +1830,7 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue @@ -2438,6 +2449,316 @@ const struct sched_class rt_sched_class = { .update_curr = update_curr_rt, }; +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) + return 1; + } + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + int ret; + + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. 
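tg_rt_schedulable() above enforces two ratio constraints: no group may exceed the global runtime/period cap, and a group's children may not sum to more than the group itself. A minimal sketch of that check; to_ratio() is approximated here as runtime scaled by 2^20 over period (matching the BW_SHIFT definition added to sched.h later in this patch), which is an assumption of this sketch rather than quoted code:

#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio_model(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << 20) / period_ns;
}

int main(void)
{
	/* Global cap: 950ms of RT runtime per 1s period. */
	uint64_t global = to_ratio_model(1000000000ULL, 950000000ULL);
	/* Parent group: 400ms/1s; its two children: 150ms/1s and 300ms/1s. */
	uint64_t parent = to_ratio_model(1000000000ULL, 400000000ULL);
	uint64_t child_sum = to_ratio_model(1000000000ULL, 150000000ULL) +
			     to_ratio_model(1000000000ULL, 300000000ULL);

	if (parent > global)
		printf("-EINVAL: group exceeds the global RT cap\n");
	else if (child_sum > parent)
		printf("-EINVAL: children (%llu) exceed their parent (%llu)\n",
		       (unsigned long long)child_sum, (unsigned long long)parent);
	else
		printf("schedulable\n");
	return 0;
}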
*/ + if (rt_period == 0) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = 
sched_rt_global_validate(); + if (ret) + goto undo; + + ret = sched_dl_global_validate(); + if (ret) + goto undo; + + ret = sched_rt_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } + mutex_unlock(&mutex); + + return ret; +} + +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the timeslice to default: + */ + if (!ret && write) { + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7808ab050599..eeef1a3086d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -39,9 +39,9 @@ #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -218,23 +218,25 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -extern struct dl_bw *dl_bw_of(int i); - struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -244,7 +246,22 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern void __dl_clear_params(struct task_struct *p); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -366,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, 
u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, @@ -558,6 +580,30 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -606,11 +652,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -1025,7 +1069,11 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - unsigned long cpumask[0]; /* iteration mask */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1046,16 +1094,15 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). 
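__dl_add() and __dl_clear() now take a CPU count and call __dl_update(), which in the SMP version quoted further down walks the root domain and adjusts every runqueue's dl.extra_bw by the task's bandwidth divided by the number of CPUs. A small model of that bookkeeping, with the usual 1 << 20 fixed-point bandwidth unit and an array standing in for the per-CPU runqueues:

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

int main(void)
{
	int64_t total_bw = 0;
	int64_t extra_bw[NCPUS] = { 1 << 20, 1 << 20, 1 << 20, 1 << 20 };
	int64_t tsk_bw = (1 << 20) / 4;     /* admitting a 25% deadline task */
	int i;

	/* __dl_add(dl_b, tsk_bw, NCPUS): account it globally... */
	total_bw += tsk_bw;
	/* ...and spread the reclaimable-bandwidth reduction, as __dl_update() does. */
	for (i = 0; i < NCPUS; i++)
		extra_bw[i] -= tsk_bw / NCPUS;

	printf("total_bw=%lld, each CPU's extra_bw=%lld (unit=%d)\n",
	       (long long)total_bw, (long long)extra_bw[0], 1 << 20);
	return 0;
}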
*/ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } @@ -1066,7 +1113,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); @@ -1422,7 +1469,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1467,6 +1518,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) } #endif +extern void schedule_idle(void); + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1484,7 +1537,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1926,6 +1984,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if 
(!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_span(group))); + + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { + printk(KERN_CONT " mask=%*pbl", + cpumask_pr_args(group_balance_mask(group))); + } + + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); + + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_span(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); @@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -477,46 +496,214 @@ enum s_alloc { }; /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the balance mask. * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. + * The balance mask are all those CPUs that could actually end up at this + * group. See build_balance_mask(). * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. + * Also see should_we_balance(). */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +int group_balance_cpu(struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + return cpumask_first(group_balance_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. 
The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group balance mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + +/* + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. + * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). + */ +static void +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +{ + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + cpumask_clear(mask); + + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. 
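The NUMA blurb above derives one topology level per "hop": for every node, the NUMA-k domain spans all nodes whose distance does not exceed the k-th distinct value in the distance table. This throwaway program reproduces the levels quoted for the 4-node ring example:

#include <stdio.h>

#define N 4

int main(void)
{
	int dist[N][N] = {                 /* the ring distance table from the comment */
		{ 10, 20, 30, 20 },
		{ 20, 10, 20, 30 },
		{ 30, 20, 10, 20 },
		{ 20, 30, 20, 10 },
	};
	int levels[] = { 10, 20, 30 };     /* distinct distances == hop levels */
	int l, i, j;

	for (l = 0; l < 3; l++) {
		printf("NUMA-%d:", l);
		for (i = 0; i < N; i++) {
			printf("  node%d:{", i);
			for (j = 0; j < N; j++)
				if (dist[i][j] <= levels[l])
					printf(" %d", j);
			printf(" }");
		}
		printf("\n");
	}
	return 0;
}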
+ */ + if (!sibling->child) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) + continue; + + cpumask_set_cpu(i, mask); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(mask)); } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_span(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg) +{ + struct cpumask *mask = sched_domains_tmpmask2; + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + int cpu; + + build_balance_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + cpumask_copy(group_balance_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg_span = sched_group_span(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) @@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. 
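The covering-group construction in build_overlap_sched_groups() (started above, continued below) walks the domain span beginning at the domain's own node, skips nodes already covered, and takes each remaining node's child-level span as a group. Run on the ring example it yields exactly the {0-1,3} and {1-3} groups the NUMA comment promises for node 0; the hard-coded spans below are those NUMA-1 spans, and the sketch deliberately ignores the asymmetric-sibling checks:

#include <stdio.h>

#define N 4

int main(void)
{
	/* Child-level (NUMA-1) span of each node in the 4-node ring. */
	int child_span[N][N] = {
		{ 1, 1, 0, 1 },		/* node 0: 0-1,3 */
		{ 1, 1, 1, 0 },		/* node 1: 0-2   */
		{ 0, 1, 1, 1 },		/* node 2: 1-3   */
		{ 1, 0, 1, 1 },		/* node 3: 0,2-3 */
	};
	int covered[N] = { 0 };
	int start = 0, k, j;

	for (k = 0; k < N; k++) {
		int i = (start + k) % N;	/* the for_each_cpu_wrap() walk */

		if (covered[i])
			continue;

		printf("group from node %d: {", i);
		for (j = 0; j < N; j++) {
			if (child_span[i][j]) {
				covered[j] = 1;
				printf(" %d", j);
			}
		}
		printf(" }\n");
	}
	return 0;
}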
+ */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; @@ -589,23 +759,106 @@ fail: return -ENOMEM; } -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. + * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. 
That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. + */ + +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_span(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } @@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) @@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; @@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) 
fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. */ -int init_sched_domains(const struct cpumask *cpu_map) +int sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); + zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..17f11c6b0a9f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -12,44 +12,44 @@ #include <linux/hash.h> #include <linux/kthread.h> -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); + spin_lock_init(&wq_head->lock); + lockdep_set_class_and_name(&wq_head->lock, key, name); + INIT_LIST_HEAD(&wq_head->head); } EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __remove_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. 
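For orientation, a minimal sketch of the classic open-coded sleep/wake pattern written against the renamed wait-queue types above (struct wait_queue_head, struct wait_queue_entry). The queue my_wq, the flag my_condition and the function my_wait() are invented for the example and are not part of the patch:

#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static bool my_condition;

static void my_wait(void)
{
        DEFINE_WAIT(wait);      /* declares a struct wait_queue_entry on the stack */

        for (;;) {
                /* queue the entry on my_wq and set the task state */
                prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
                if (my_condition)
                        break;
                schedule();
        }
        /* back to TASK_RUNNING; dequeue the entry if it is still queued */
        finish_wait(&my_wq, &wait);
}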
*/ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { - wait_queue_t *curr, *next; + wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, /** * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function @@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets @@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. 
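The matching producer side, continuing the illustrative my_wq/my_condition names from the sketch above; wake_up() is the usual wrapper that ends up in __wake_up(wq_head, TASK_NORMAL, 1, NULL):

static void my_signal(void)
{
        my_condition = true;
        /*
         * __wake_up() takes wq_head->lock, so a waiter that went to sleep
         * via prepare_to_wait() above either saw my_condition before
         * sleeping or is still on the list and gets woken here.
         */
        wake_up(&my_wq);
}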
*/ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!q)) + if (unlikely(!wq_head)) return; if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) { - __wake_up_sync_key(q, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -170,48 +170,48 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_t *wait, int flags) +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { - wait->flags = flags; - wait->private = current; - wait->func = autoremove_wake_function; - INIT_LIST_HEAD(&wait->task_list); + wq_entry->flags = flags; + wq_entry->private = current; + wq_entry->func = autoremove_wake_function; + INIT_LIST_HEAD(&wq_entry->entry); } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* * Exclusive waiter must not fail if it was selected by wakeup, @@ -219,24 +219,24 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) * * The caller will recheck the condition and return 
success if * we were already woken up, we can not miss the event because - * wakeup locks/unlocks the same q->lock. + * wakeup locks/unlocks the same wq_head->lock. * * But we need to ensure that set-condition + wakeup after that * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wait->task_list); + list_del_init(&wq_entry->entry); ret = -ERESTARTSYS; } else { - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); + if (list_empty(&wq_entry->entry)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); else - __add_wait_queue(q, wait); + __add_wait_queue(wq_head, wq_entry); } set_current_state(state); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); return ret; } @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); * condition in the caller before they add the wait * entry to the wake queue. */ -int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + if (likely(list_empty(&wait->entry))) + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) } EXPORT_SYMBOL(do_wait_intr); -int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + if (likely(list_empty(&wait->entry))) + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor + * @wq_head: waitqueue waited on + * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) * have _one_ other CPU that looks at or modifies * the list). 
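As a rough sketch of how prepare_to_wait_event() is meant to be driven (this is approximately what the wait_event_interruptible() family expands to; my_wq, my_condition and my_wait_interruptible() are illustrative only):

static int my_wait_interruptible(void)
{
        DEFINE_WAIT(wait);
        long ret = 0;

        for (;;) {
                /* returns -ERESTARTSYS, and dequeues the entry, on a pending signal */
                ret = prepare_to_wait_event(&my_wq, &wait, TASK_INTERRUPTIBLE);
                if (my_condition) {
                        ret = 0;
                        break;
                }
                if (ret)
                        break;
                schedule();
        }
        finish_wait(&my_wq, &wait);
        return ret;
}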
*/ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); + if (!list_empty_careful(&wq_entry->entry)) { + spin_lock_irqsave(&wq_head->lock, flags); + list_del_init(&wq_entry->entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - int ret = default_wake_function(wait, mode, sync, key); + int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wait->task_list); + list_del_init(&wq_entry->entry); return ret; } EXPORT_SYMBOL(autoremove_wake_function); @@ -334,24 +334,24 @@ static inline bool is_kthread_should_stop(void) /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * - * add_wait_queue(&wq, &wait); + * add_wait_queue(&wq_head, &wait); * for (;;) { * if (condition) * break; * * p->state = mode; condition = true; * smp_mb(); // A smp_wmb(); // C - * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; * schedule() try_to_wake_up(); * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; * smp_mb() // B smp_wmb(); // C - * wait->flags |= WQ_FLAG_WOKEN; + * wq_entry->flags |= WQ_FLAG_WOKEN; * } - * remove_wait_queue(&wq, &wait); + * remove_wait_queue(&wq_head, &wait); * */ -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -359,7 +359,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must * also observe all state before the wakeup. */ - if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); @@ -369,13 +369,13 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -385,267 +385,8 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) * and is paired with smp_store_mb() in wait_woken(). 
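The pattern in the wait_woken() comment above, fleshed out into a compilable sketch (my_wq and my_condition as before; the timeout plumbing mirrors how networking code commonly uses this helper):

static long my_wait_woken(long timeout)
{
        /* wake function that sets WQ_FLAG_WOKEN and wakes the task */
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(&my_wq, &wait);
        while (!my_condition && timeout && !signal_pending(current))
                timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
        remove_wait_queue(&my_wq, &wait);

        return timeout;         /* remaining time; 0 means the wait timed out */
}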
*/ smp_wmb(); /* C */ - wait->flags |= WQ_FLAG_WOKEN; + wq_entry->flags |= WQ_FLAG_WOKEN; - return default_wake_function(wait, mode, sync, key); + return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key, mode); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, - unsigned mode, unsigned long timeout) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - for (;;) { - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - ret = action(&q->key, mode); - /* - * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wq->lock, - * but test_and_set_bit() implies mb() which pairs with - * smp_mb__after_atomic() before wake_up_page(). - */ - if (ret) - finish_wait(wq, &q->wait); - } - if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { - if (!ret) - finish_wait(wq, &q->wait); - return 0; - } else if (ret) { - return ret; - } - } -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. 
This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - val = q->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - wait_queue_head_t *wq = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); - - return __wait_on_atomic_t(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). 
- */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); - -__sched int bit_wait(struct wait_bit_key *word, int mode) -{ - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait); - -__sched int bit_wait_io(struct wait_bit_key *word, int mode) -{ - io_schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait_io); - -__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_timeout); - -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c new file mode 100644 index 000000000000..f8159698aa4d --- /dev/null +++ b/kernel/sched/wait_bit.c @@ -0,0 +1,286 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include <linux/wait_bit.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> +#include <linux/hash.h> + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wq_entry, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. 
+ */ +int __sched +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + +int __sched +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + for (;;) { + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wwq_head->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + } + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + if (!ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + return 0; + } else if (ret) { + return ret; + } + } +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. 
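A minimal sketch of the bit-wait pairing the wake_up_bit() kernel-doc above describes, with an invented flags word (my_flags, bit 0 meaning "busy") standing in for a real user:

#include <linux/wait_bit.h>

static unsigned long my_flags;          /* bit 0: object is busy */

static int my_wait_until_idle(void)
{
        /* sleeps on the hashed bit waitqueue until bit 0 is clear */
        return wait_on_bit(&my_flags, 0, TASK_INTERRUPTIBLE);
}

static void my_mark_idle(void)
{
        clear_bit(0, &my_flags);
        /* pairs with the waitqueue_active() check in __wake_up_bit() */
        smp_mb__after_atomic();
        wake_up_bit(&my_flags, 0);
}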
+ */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wq_entry = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); + + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). 
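The atomic_t flavour follows the same shape; a sketch of waiting for an invented reference count to reach zero (my_users, my_wait_action() and both callers are illustrative; the action callback has the int (*)(atomic_t *) signature that out_of_line_wait_on_atomic_t() expects):

static atomic_t my_users = ATOMIC_INIT(0);

/* the "action": how to sleep for one iteration of __wait_on_atomic_t() */
static int my_wait_action(atomic_t *p)
{
        schedule();
        return 0;               /* a non-zero return would abort the wait */
}

static int my_wait_for_no_users(void)
{
        return wait_on_atomic_t(&my_users, my_wait_action, TASK_UNINTERRUPTIBLE);
}

static void my_put_ref(void)
{
        if (atomic_dec_and_test(&my_users))
                wake_up_atomic_t(&my_users);
}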
+ */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word, int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word, int mode) +{ + io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} diff --git a/kernel/signal.c b/kernel/signal.c index a8c54f384553..9b28954e058f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, + bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); + + *resched_timer = + (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER) && + (info->si_sys_private); + __sigqueue_free(first); } else { /* @@ -548,12 +555,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) + siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info); + collect_signal(sig, pending, info, resched_timer); return sig; } @@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { + bool resched_timer = false; int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); + mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? @@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) current->jobctl |= JOBCTL_STOP_DEQUEUED; } #ifdef CONFIG_POSIX_TIMERS - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + if (resched_timer) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. 
Note, we leave @@ -1237,7 +1245,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * @@ -2092,7 +2100,6 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info) { - ptrace_signal_deliver(); /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -30,6 +30,7 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); + return -ENOMEM; + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } @@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask, cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, cfd->cpumask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ if (unlikely(!cpumask_weight(cfd->cpumask))) return; + cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; - llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { for_each_cpu(cpu, cfd->cpumask) { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..939a158eab11 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; - uuid_be uuid; + uuid_t uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; result = -EIO; - if (uuid_be_to_bin(buf, &uuid)) + if (uuid_parse(buf, &uuid)) goto out; if (oldlen > 16) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping 
duty. -config NO_HZ_FULL_SYSIDLE - bool "Detect full-system idle state for full dynticks system" - depends on NO_HZ_FULL - default n - help - At least one CPU must keep the scheduling-clock tick running for - timekeeping purposes whenever there is a non-idle CPU, where - "non-idle" also includes dynticks CPUs as long as they are - running non-idle tasks. Because the underlying adaptive-tick - support cannot distinguish between all CPUs being idle and - all CPUs each running a single task in dynticks mode, the - underlying support simply ensures that there is always a CPU - handling the scheduling-clock tick, whether or not all CPUs - are idle. This Kconfig option enables scalable detection of - the all-CPUs-idle state, thus allowing the scheduling-clock - tick to be disabled when all CPUs are idle. Note that scalable - detection of the all-CPUs-idle state means that larger systems - will be slower to declare the all-CPUs-idle state. - - Say Y if you would like to help debug all-CPUs-idle detection. - - Say N if you are unsure. - -config NO_HZ_FULL_SYSIDLE_SMALL - int "Number of CPUs above which large-system approach is used" - depends on NO_HZ_FULL_SYSIDLE - range 1 NR_CPUS - default 8 - help - The full-system idle detection mechanism takes a lazy approach - on large systems, as is required to attain decent scalability. - However, on smaller systems, scalability is not anywhere near as - large a concern as is energy efficiency. The sysidle subsystem - therefore uses a fast but non-scalable algorithm for small - systems and a lazier but scalable algorithm for large systems. - This Kconfig parameter defines the number of CPUs in the largest - system that will be considered to be "small". - - The default value will be fine in most cases. Battery-powered - systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger - numbers of CPUs, and (3) are suffering from battery-lifetime - problems due to long sysidle latencies might wish to experiment - with larger values for this Kconfig parameter. On the other - hand, they might be even better served by disabling NO_HZ_FULL - entirely, given that NO_HZ_FULL is intended for HPC and - real-time workloads that at present do not tend to be run on - battery-powered systems. - - Take the default if you are unsure. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..ee2f4202d82a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -387,7 +387,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add(start, base->gettime()); + start = ktime_add_safe(start, base->gettime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -475,7 +475,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) overrun++; } - alarm->node.expires = ktime_add(alarm->node.expires, interval); + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); @@ -660,13 +660,21 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + + /* + * Rate limit to the tick as a hot fix to prevent DOS. Will be + * mopped up later. 
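The ktime_add() to ktime_add_safe() conversions in the alarmtimer hunks above are about overflow: a (possibly user-supplied) relative value can sit close to KTIME_MAX, and a plain signed addition would wrap. A small illustrative helper, not part of the patch, showing the difference:

static void my_show_ktime_saturation(void)
{
        ktime_t now = ktime_get();
        ktime_t huge = KTIME_MAX - 1;

        /* a plain ktime_add() can wrap around to a negative expiry ... */
        pr_info("plain: %lld\n", (long long)ktime_add(now, huge));
        /* ... while ktime_add_safe() saturates at KTIME_MAX instead */
        pr_info("safe:  %lld\n", (long long)ktime_add_safe(now, huge));
}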
+ */ + if (timr->it.alarm.interval < TICK_NSEC) + timr->it.alarm.interval = TICK_NSEC; + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add(now, exp); + exp = ktime_add_safe(now, exp); } alarm_start(&timr->it.alarm.alarmtimer, exp); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..d2a1e6dd0291 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. 
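For context on the clocksource hunk above: ->tick_stable() is an optional callback that the clocksource watchdog now invokes periodically for the currently selected clocksource. A hypothetical clocksource wiring it up might look like this (every name here is invented); registration would then go through clocksource_register_hz() as usual:

static u64 my_cs_read(struct clocksource *cs)
{
        return 0;       /* read the real hardware counter here */
}

static void my_cs_tick_stable(struct clocksource *cs)
{
        /* periodic "still looks stable" notification, e.g. to refine calibration */
}

static struct clocksource my_cs = {
        .name           = "my_cs",
        .rating         = 100,
        .read           = my_cs_read,
        .mask           = CLOCKSOURCE_MASK(64),
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
        .tick_stable    = my_cs_tick_stable,
};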
*/ - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 987e496bb51a..b398c2ea69b2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -37,9 +37,11 @@ static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif @@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, /** * tick_broadcast_setup_oneshot - setup the broadcast device */ -void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f738251000fe..be0ac01f2e12 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) -extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); @@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..c7a899c5ce64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. 
+ */ + ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); @@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, @@ -701,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { - tick = 0; - /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): @@ -712,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ - if (!ts->tick_stopped) - goto out; - - /* - * If, OTOH, we did stop it, but there's a pending (expired) - * timer reprogram the timer hardware to fire now. - * - * We will not restart the tick proper, just prod the timer - * hardware into firing an interrupt to process the pending - * timers. Just like tick_irq_exit() will not restart the tick - * for 'normal' interrupts. - * - * Only once we exit the idle loop will we re-enable the tick, - * see tick_nohz_idle_exit(). - */ - if (delta == 0) { - tick_nohz_restart(ts, now); + if (!ts->tick_stopped) { + tick = 0; goto out; } } @@ -771,8 +766,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) - goto out; + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + goto out; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + } /* * nohz_stop_sched_tick can be called several times before @@ -782,8 +785,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + calc_load_nohz_start(); cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -791,6 +793,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. 
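The ts->next_tick bookkeeping above boils down to remembering the deadline most recently handed to the hardware and skipping a redundant reprogram; stripped of the nohz details (and with invented names) the idea is simply the following, and the various ts->next_tick = 0 assignments in the patch are the points where this cache has to be invalidated:

static u64 my_last_deadline;

static void my_program_tick(u64 deadline)
{
        /* same deadline as last time: skip the expensive hardware access */
        if (deadline == my_last_deadline)
                return;

        my_last_deadline = deadline;
        /* ... program the clock event device here ... */
}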
@@ -801,12 +805,17 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, goto out; } + hrtimer_set_expires(&ts->sched_timer, tick); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -823,7 +832,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) */ timer_clear_idle(); - calc_load_exit_idle(); + calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick @@ -864,6 +873,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; return false; } @@ -923,8 +937,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_expires = expires; } - if (!was_stopped && ts->tick_stopped) + if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } } } @@ -1172,6 +1188,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (regs) tick_sched_handle(ts, regs); + else + ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/time.c b/kernel/time/time.c index 6574bba44b55..49c73c6ed648 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - /* * Convert jiffies to milliseconds and back. 
* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9652bc57fd09..b602c48cb841 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). + */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) */ do { seq = read_seqcount_begin(&tk_core.seq); - now = tkr->read(tkr->clock); + now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; @@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) u64 cycle_now, delta; /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; - tk->tkr_mono.read = clock->read; tk->tkr_mono.mask = clock->mask; - tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; - tk->tkr_raw.read = clock->read; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -262,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = (interval * clock->mult) >> clock->shift; + tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) now += timekeeping_delta_to_ns(tkr, clocksource_delta( - tkr->read(tkr->clock), + tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs) return cycles_at_suspend; } +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
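The situation tk_clock_read() above guards against can be shown as a pair of read helpers; this is an illustration only, and the "racy" variant uses the ->read member of struct tk_read_base that existed before this patch:

/* pre-patch pattern: two separate loads that a clocksource change can split */
static u64 my_read_racy(struct tk_read_base *tkr)
{
        return tkr->read(tkr->clock);   /* may pair the new ->read with the old clock */
}

/* post-patch pattern (what tk_clock_read() does): one consistent snapshot */
static u64 my_read_consistent(struct tk_read_base *tkr)
{
        struct clocksource *clock = READ_ONCE(tkr->clock);

        return clock->read(clock);
}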
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk) struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - cycles_at_suspend = tkr->read(tkr->clock); - tkr_dummy.read = dummy_clock_read; + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - tkr_dummy.read = dummy_clock_read; + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } @@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr_mono.clock; u64 cycle_now, delta; u64 nsec; - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); - - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, @@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. */ - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!cycle_between(interval_start, cycles, now)) { clock_was_set_seq = tk->clock_was_set_seq; @@ -1629,7 +1649,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { u64 nsec, cyc_delta; @@ -1976,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; - u64 raw_nsecs; + u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) @@ -1991,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = (u64)tk->raw_interval << shift; - raw_nsecs += tk->raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - tk->raw_time.tv_sec += raw_secs; + tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_time.tv_sec++; } - tk->raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; @@ -2030,7 +2051,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bd8ae8d5ae9c..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } @@ -1662,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { + if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } - } else if (sscanf(buf, "%llu", &value) != 1) + } else if (kstrtoull(buf, 0, &value)) goto out; ret = -ENXIO; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39dca4e86a94..b308be30dfb9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4144,9 +4144,9 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, int i, ret = -ENODEV; int size; - if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) + if (!glob || !strlen(glob) || !strcmp(glob, "*")) func_g.search = NULL; - else if (glob) { + else { int not; func_g.type = filter_parse_regex(glob, strlen(glob), @@ -4256,6 +4256,14 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, 
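The reworked raw-time accumulation above keeps tkr_raw.xtime_nsec in "shifted nanoseconds" (nanoseconds << shift) and carries whole seconds by comparing against NSEC_PER_SEC << shift, rather than converting every interval down to plain nanoseconds and losing the sub-nanosecond remainder. A compact sketch of that bookkeeping, with an arbitrary shift of 8 and made-up interval values:

/*
 * Sketch of accumulating time in shifted nanoseconds and carrying whole
 * seconds, mirroring the raw-time path in logarithmic_accumulation().
 * The shift and the 0.6 s step are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct raw_clock {
	unsigned int shift;	/* clocksource shift */
	uint64_t xtime_nsec;	/* nanoseconds << shift */
	uint64_t sec;		/* whole seconds */
};

static void accumulate(struct raw_clock *rc, uint64_t interval_snsec)
{
	uint64_t snsec_per_sec = NSEC_PER_SEC << rc->shift;

	rc->xtime_nsec += interval_snsec;
	while (rc->xtime_nsec >= snsec_per_sec) {
		rc->xtime_nsec -= snsec_per_sec;
		rc->sec++;
	}
}

int main(void)
{
	struct raw_clock rc = { .shift = 8 };
	/* 0.6 s per step, expressed in shifted nanoseconds. */
	uint64_t step = (NSEC_PER_SEC * 6 / 10) << rc.shift;
	int i;

	for (i = 0; i < 4; i++)
		accumulate(&rc, step);

	/* Prints "2 s + 400000000 ns": 4 * 0.6 s with no precision lost. */
	printf("%llu s + %llu ns\n",
	       (unsigned long long)rc.sec,
	       (unsigned long long)(rc.xtime_nsec >> rc.shift));
	return 0;
}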
return ret; } +void clear_ftrace_function_probes(struct trace_array *tr) +{ + struct ftrace_func_probe *probe, *n; + + list_for_each_entry_safe(probe, n, &tr->func_probes, list) + unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); @@ -4329,9 +4337,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - if (WARN_ON_ONCE(!tr)) - return -EINVAL; - mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { @@ -5055,7 +5060,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) } out: - kfree(fgd->new_hash); + free_ftrace_hash(fgd->new_hash); kfree(fgd); return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ad4420b33d6..091e801145c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1558,7 +1558,7 @@ static __init int init_trace_selftests(void) return 0; } -early_initcall(init_trace_selftests); +core_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { @@ -2568,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + if (rcu_is_watching()) { + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + return; + } + + /* + * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * but if the above rcu_is_watching() failed, then the NMI + * triggered someplace critical, and rcu_irq_enter() should + * not be called from NMI. + */ + if (unlikely(in_nmi())) + return; + + /* + * It is possible that a function is being traced in a + * location that RCU is not watching. A call to + * rcu_irq_enter() will make sure that it is, but there's + * a few internal rcu functions that could be traced + * where that wont work either. In those cases, we just + * do nothing. 
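clear_ftrace_function_probes() above walks tr->func_probes with list_for_each_entry_safe() because unregistering a probe removes it from the very list being walked. A user-space sketch of that "grab the next pointer before touching the current node" pattern, on a toy singly linked list rather than the kernel's struct list_head; all names are invented:

/*
 * Safe-iteration sketch: remember ->next before handling the current
 * node, so the node may be unlinked and freed mid-walk.
 */
#include <stdio.h>
#include <stdlib.h>

struct probe {
	int id;
	struct probe *next;
};

static struct probe *probes;

static void add_probe(int id)
{
	struct probe *p = malloc(sizeof(*p));

	p->id = id;
	p->next = probes;
	probes = p;
}

static void clear_probes(void)
{
	struct probe *p, *n;

	/* "safe" walk: fetch ->next before the current node is freed */
	for (p = probes; p; p = n) {
		n = p->next;
		printf("unregistering probe %d\n", p->id);
		free(p);
	}
	probes = NULL;
}

int main(void)
{
	add_probe(1);
	add_probe(2);
	add_probe(3);
	clear_probes();
	return 0;
}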
+ */ + if (unlikely(rcu_irq_enter_disabled())) + return; + + rcu_irq_enter_irqson(); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + rcu_irq_exit_irqson(); } /** @@ -3311,13 +3340,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) + if (cpumask_available(iter->started) && + cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - if (iter->started) + if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -6851,6 +6881,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *number; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; @@ -7549,6 +7582,7 @@ static int instance_rmdir(const char *name) } tracing_set_nop(tr); + clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 291a1bca5748..39fd77330aab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -980,6 +980,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, extern int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops); +extern void clear_ftrace_function_probes(struct trace_array *tr); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -998,6 +999,10 @@ static inline __init int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } +static inline void clear_ftrace_function_probes(struct trace_array *tr) +{ +} + /* * The ops parameter passed in is usually undefined. * This must be a macro. diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a3bddbfd0874..a0910c0cdf2e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; @@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, @@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &dump_probe_ops; /* Only dump once. */ @@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &cpudump_probe_ops; /* Only dump once. 
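The trace.h hunk above pairs the new declaration with an empty static inline stub for configurations where the function-probe code is not built, so callers such as instance_rmdir() compile unchanged either way. A stripped-down sketch of that header pattern, with an invented CONFIG_MY_PROBES symbol and function name:

/*
 * Compiled-out stub sketch: declare the real function when the feature
 * is built in, otherwise provide a do-nothing static inline, so callers
 * need no #ifdefs of their own.
 */
#include <stdio.h>

#ifdef CONFIG_MY_PROBES
void clear_my_probes(void);		/* real implementation elsewhere */
#else
static inline void clear_my_probes(void)
{
	/* feature compiled out: deliberately do nothing */
}
#endif

int main(void)
{
	/* Compiles either way; with the stub this call is a no-op. */
	clear_my_probes();
	printf("done\n");
	return 0;
}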
*/ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8485f6738a87..b53c8d369163 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,20 +707,16 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - /* an address specified */ - ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { + + /* try to parse an address. if that fails, try to read the + * input as a symbol. */ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { - pr_info("Failed to parse symbol.\n"); + pr_info("Failed to parse either an address or a symbol.\n"); return ret; } if (offset && is_return && @@ -1535,6 +1531,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 76aa04d4c925..b4a751e8f9d6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -409,7 +409,9 @@ static const struct file_operations stack_trace_fops = { static int stack_trace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } @@ -476,7 +478,7 @@ static __init int stack_trace_init(void) NULL, &stack_trace_fops); trace_create_file("stack_trace_filter", 0444, d_tracer, - NULL, &stack_trace_filter_fops); + &trace_ops, &stack_trace_filter_fops); if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct *work; }; -static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); |
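The create_trace_kprobe() change above first tries to parse the probe point as a numeric address and only falls back to symbol handling when that parse fails, which is why the error message now mentions both forms. A user-space analogue of that "number first, then symbol" fallback, using strtoul() with an end-pointer check in place of the kernel's stricter kstrtoul(); the function name and sample inputs are invented:

/*
 * Parse-fallback sketch: accept either a clean numeric address or a
 * symbol[+offset] string, trying the numeric form first.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void parse_probe_point(const char *arg)
{
	char *end;
	unsigned long addr;

	errno = 0;
	addr = strtoul(arg, &end, 0);
	if (errno == 0 && end != arg && *end == '\0') {
		printf("address probe at 0x%lx\n", addr);
		return;
	}

	/* Not a clean number: treat it as "symbol" or "symbol+offset". */
	printf("symbol probe at %s\n", arg);
}

int main(void)
{
	parse_probe_point("do_sys_open+0x10");	/* falls back to symbol */
	parse_probe_point("0xffffa000");	/* parsed as a numeric address */
	return 0;
}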