Diffstat (limited to 'kernel'): 65 files changed, 1749 insertions, 662 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/audit.c b/kernel/audit.c index 833267bbd80b..6dd556931739 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 62d686d96581..9eb8b3511636 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. 
+ */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..6c772adabad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info_len = min_t(u32, sizeof(info), info_len); if (copy_from_user(&info, uinfo, info_len)) - return err; + return -EFAULT; info.type = prog->type; info.id = prog->aux->id; @@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.xlated_prog_len; - info.xlated_prog_len = bpf_prog_size(prog->len); + info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..664d93972373 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. 
*/ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -1844,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, * do our normal operations to the register, we need to set the values * to the min/max since they are undefined. */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (opcode != BPF_SUB) { + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } switch (opcode) { case BPF_ADD: @@ -1858,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: + /* If one of our values was at the end of our ranges, then the + * _opposite_ value in the dst_reg goes to the end of our range. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= min_val; + dst_reg->min_value -= max_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= max_val; + dst_reg->max_value -= min_val; dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: @@ -2023,6 +2053,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2229,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. 
*/ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2293,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2308,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. 
*/ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2373,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8362bac0d179..87a1213dd326 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -63,6 +63,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ diff --git a/kernel/cpu.c b/kernel/cpu.c index b03a32595cfe..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -271,11 +271,26 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); + static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); + if (WARN_ON_ONCE((!cpu_online(cpu)))) + return -ECANCELED; + + /* Unpark the stopper thread and the hotplug thread of the target cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) { + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + } return st->result; } @@ -296,9 +311,7 @@ static int bringup_cpu(unsigned int cpu) irq_unlock_sparse(); if (ret) return ret; - ret = bringup_wait_for_ap(cpu); - BUG_ON(!cpu_online(cpu)); - return ret; + return bringup_wait_for_ap(cpu); } /* @@ -767,31 +780,20 @@ void notify_cpu_starting(unsigned int cpu) } /* - * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread and unpark the smpboot threads. If the target state is - * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the - * cpu further. + * Called from the idle task. Wake up the controlling task which brings the + * stopper and the hotplug thread of the upcoming CPU up and then delegates + * the rest of the online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - unsigned int cpu = smp_processor_id(); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; st->state = CPUHP_AP_ONLINE_IDLE; - - /* Unpark the stopper thread and the hotplug thread of this cpu */ - stop_machine_unpark(cpu); - kthread_unpark(st->thread); - - /* Should we go further up ? 
*/ - if (st->target > CPUHP_AP_ONLINE_IDLE) - __cpuhp_kick_ap_work(st); - else - complete(&st->done); + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,12 @@ #include <asm/sections.h> /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +static unsigned char *vmcoreinfo_data; +static size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; /* * parsing the "crashkernel" commandline @@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); @@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/events/core.c b/kernel/events/core.c index 1538df9b2b65..3504125871d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) lockdep_assert_held(&ctx->lock); + /* + * It's 'group type', really, because if our group leader is + * pinned, so are we. + */ + if (event->group_leader != event) + event = event->group_leader; + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; @@ -2210,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. 
And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2217,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2464,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } @@ -4378,7 +4416,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4408,12 +4448,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -5078,7 +5121,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5101,7 +5144,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5399,7 +5442,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } @@ -7321,21 +7364,6 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to 
interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -7357,12 +7385,6 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); /* - * For security, drop the skid kernel samples if necessary. - */ - if (!sample_is_allowed(event, regs)) - return ret; - - /* * XXX event_limit might not quite work as expected on inherited * events */ @@ -10010,28 +10032,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned diff --git a/kernel/exit.c b/kernel/exit.c index 2bbc23273e2f..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1590,9 +1590,6 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) - ret = 0; - put_pid(pid); return ret; } @@ -1603,6 +1600,11 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) @@ -1612,7 +1614,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); @@ -1637,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { @@ -1714,6 +1720,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? 
&ru : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err && uru) { /* kernel_waitid() overwrites everything in ru */ @@ -1729,7 +1740,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/fork.c b/kernel/fork.c index aa01b810c0bd..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; @@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* @@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: @@ -808,8 +806,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif @@ -1637,9 +1636,9 @@ static __latent_entropy struct task_struct *copy_process( prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_init(&p->vtime_seqcount); - p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_INACTIVE; + seqcount_init(&p->vtime.seqcount); + p->vtime.starttime = 0; + p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/futex.c b/kernel/futex.c index c934689043b2..f50b434756c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ 
-212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ @@ -670,13 +670,14 @@ again: * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was - * truncated in parallel so warn for now if this happens. + * truncated in parallel which is almost certainly an + * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ - if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page); diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -5,6 +5,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> +#include <linux/sort.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple Shell sort */ +static int gid_cmp(const void *_a, const void *_b) +{ + kgid_t a = *(kgid_t *)_a; + kgid_t b = *(kgid_t *)_b; + + return gid_gt(a, b) - gid_lt(a, b); +} + static void groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - kgid_t tmp = group_info->gid[right]; - - while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { - group_info->gid[right] = group_info->gid[left]; - right = left; - left -= stride; - } - group_info->gid[right] = tmp; - } - stride /= 3; - } + sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), + gid_cmp, NULL); } /* a simple bsearch */ diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d2747f9c5707..d69bd77252a7 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) struct cpumask *masks; cpumask_var_t nmsk, *node_to_present_cpumask; + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. 
+ */ + if (!affv) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -192,15 +199,19 @@ out: /** * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @minvec: The minimum number of vectors available * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; int ret; + if (resv > minvec) + return 0; + get_online_cpus(); ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; put_online_cpus(); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ad43468e89f0..3675c6004f2a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); @@ -234,7 +224,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) return IRQ_STARTUP_MANAGED; } #else -static int +static __always_inline int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; @@ -1010,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1024,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1035,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index aee8f7ec40af..638eb9c83d9f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = cpu_online_mask; brokeaff = true; } - - err = irq_do_set_affinity(d, affinity, true); + /* + * Do not set the force argument of irq_do_set_affinity() as this + * disables the masking of offline CPUs from the supplied affinity + * mask and therefore might keep/reassign the irq to the outgoing + * CPU. 
+ */ + err = irq_do_set_affinity(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9da14d125df4..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) @@ -437,7 +447,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -static inline void irq_domain_debugfs_init(struct dentry *root); +static inline void irq_domain_debugfs_init(struct dentry *root) +{ +} # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8bbd06405e60..73be2b3909bd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, affinity, owner); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 14fe862aa2e3..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "irq: " fmt +#include <linux/acpi.h> #include <linux/debugfs.h> #include <linux/hardirq.h> #include <linux/interrupt.h> @@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } +#ifdef CONFIG_ACPI + } else if (is_acpi_device_node(fwnode)) { + struct acpi_buffer buf = { + .length = ACPI_ALLOCATE_BUFFER, + }; + acpi_handle handle; + + handle = acpi_device_handle(to_acpi_device_node(fwnode)); + if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { + domain->name = buf.pointer; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + domain->fwnode = fwnode; +#endif } else if (of_node) { char *name; @@ -1667,8 +1683,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - if (d->debugfs_file) - debugfs_remove(d->debugfs_file); + debugfs_remove(d->debugfs_file); } void __init irq_domain_debugfs_init(struct dentry *root) diff --git 
a/kernel/irq/manage.c b/kernel/irq/manage.c index 5c11c1730ba5..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. + * + * Locking rules: + * + * desc->request_mutex Provides serialization against a concurrent free_irq() + * chip_bus_lock Provides serialization for slow bus operations + * desc->lock Provides serialization against hard interrupts + * + * chip_bus_lock and desc->lock are sufficient for all other management and + * interrupt related functions. desc->request_mutex solely serializes + * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) @@ -1168,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags &= ~IRQF_ONESHOT; /* + * Protects against a concurrent __free_irq() call which might wait + * for synchronize_irq() to complete without holding the optional + * chip bus lock and desc->lock. + */ + mutex_lock(&desc->request_mutex); + + /* + * Acquire bus lock as the irq_request_resources() callback below + * might rely on the serialization or the magic power management + * functions which are abusing the irq_bus_lock() callback, + */ + chip_bus_lock(desc); + + /* First installed action requests resources. */ + if (!desc->action) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_bus_unlock; + } + } + + /* * The following block of code has to be executed atomically + * protected against a concurrent interrupt and any of the other + * management calls which are not serialized via + * desc->request_mutex or the optional bus lock. 
*/ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; @@ -1267,13 +1304,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - ret = irq_request_resources(desc); - if (ret) { - pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", - new->name, irq, desc->irq_data.chip->name); - goto out_unlock; - } - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1281,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - irq_release_resources(desc); + if (ret) goto out_unlock; - } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ @@ -1347,6 +1375,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); @@ -1378,6 +1408,12 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); + if (!desc->action) + irq_release_resources(desc); +out_bus_unlock: + chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1417,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1443,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); @@ -1458,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); return NULL; } @@ -1475,8 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_release_resources(desc); - irq_remove_timings(desc); } #ifdef CONFIG_SMP @@ -1486,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + /* + * Drop bus_lock here so the changes which were done in the chip + * callbacks above are synced out to the irq chips which hang + * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * + * Aside of that the bus_lock can also be taken from the threaded + * handler in irq_finalize_oneshot() which results in a deadlock + * because synchronize_irq() would wait forever for the thread to + * complete, which is blocked on the bus lock. + * + * The still held desc->request_mutex() protects against a + * concurrent request_irq() of this irq so the release of resources + * and timing data is properly serialized. + */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1518,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + /* Last action releases resources */ + if (!desc->action) { + /* + * Reaquire bus lock as irq_release_resources() might + * require it to deallocate resources over the slow bus. 
+ */ + chip_bus_lock(desc); + irq_release_resources(desc); + chip_bus_sync_unlock(desc); + irq_remove_timings(desc); + } + + mutex_unlock(&desc->request_mutex); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); @@ -1674,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); @@ -1924,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1935,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * request_percpu_irq - allocate a percpu interrupt line + * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * @@ -1950,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * the handler gets called with the interrupted CPU's instance of * that variable. */ -int request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev_id) +int __request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -1965,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + if (flags && flags != IRQF_TIMER) + return -EINVAL; + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; @@ -1980,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); @@ -1991,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } -EXPORT_SYMBOL_GPL(request_percpu_irq); +EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -28,12 +28,6 @@ #include <asm/sections.h> -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - /* * These will be re-linked against their real values * during the second link stage. 
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) static int is_ksym_addr(unsigned long addr) { - if (all_var) + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; - else if (all_var) + else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include <linux/bug.h> #include <linux/err.h> #include <linux/kcmp.h> +#include <linux/capability.h> +#include <linux/list.h> +#include <linux/eventpoll.h> +#include <linux/file.h> #include <asm/unistd.h> @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. 
as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 766e7e4d3ad9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include <linux/vmalloc.h> #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ @@ -298,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include <linux/purgatory.h> void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ #include <trace/events/module.h> -extern int max_threads; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -56,6 +54,33 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES +/* + * Assuming: + * + * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + * (u64) THREAD_SIZE * 8UL); + * + * If you need less than 50 threads would mean we're dealing with systems + * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * and this would only be an be an upper limit, after which the OOM killer + * would take effect. Systems like these are very unlikely if modules are + * enabled. 
+ */ +#define MAX_KMOD_CONCURRENT 50 +static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); + +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 /* modprobe_path is set via /proc/sys. @@ -127,11 +152,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; int ret; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -154,40 +175,34 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. - * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. --RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; + if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; } - atomic_dec(&kmod_concurrent); - return -ENOMEM; } trace_module_request(module_name, wait, _RET_IP_); ret = call_modprobe(module_name, wait ? 
UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_dec(&kmod_concurrent); + atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); + return ret; } EXPORT_SYMBOL(__request_module); + #endif /* CONFIG_MODULES */ static void call_usermodehelper_freeinfo(struct subprocess_info *info) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6756d750b31b..a1606a4224e1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) int register_jprobes(struct jprobe **jps, int num) { - struct jprobe *jp; int ret = 0, i; if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { - unsigned long addr, offset; - jp = jps[i]; - addr = arch_deref_entry_point(jp->entry); - - /* Verify probepoint is a function entry point */ - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && - offset == 0) { - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); - } else - ret = -EINVAL; + ret = register_jprobe(jps[i]); if (ret < 0) { if (i > 0) @@ -1796,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num) break; } } + return ret; } EXPORT_SYMBOL_GPL(register_jprobes); int register_jprobe(struct jprobe *jp) { - return register_jprobes(&jp, 1); + unsigned long addr, offset; + struct kprobe *kp = &jp->kp; + + /* + * Verify probepoint as well as the jprobe handler are + * valid function entry points. + */ + addr = arch_deref_entry_point(jp->entry); + + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && + kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { + kp->pre_handler = setjmp_pre_handler; + kp->break_handler = longjmp_break_handler; + return register_kprobe(kp); + } + + return -EINVAL; } EXPORT_SYMBOL_GPL(register_jprobe); @@ -1888,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); -bool __weak arch_function_offset_within_entry(unsigned long offset) +bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; } -bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); @@ -1901,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign return false; if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || - !arch_function_offset_within_entry(offset)) + !arch_kprobe_on_func_entry(offset)) return false; return true; @@ -1914,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp) int i; void *addr; - if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) return -EINVAL; if (kretprobe_blacklist_size) { diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { NULL }; -static struct attribute_group kernel_attr_group = { +static const struct attribute_group kernel_attr_group = { .attrs = 
kernel_attrs, }; diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,6 +20,7 @@ #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/spinlock.h> #include <asm/qrwlock.h> /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -28,6 +28,7 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/mutex.h> +#include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); - if (!list_empty(&sem->wait_list)) - __rwsem_do_wake(sem, 1); + if (!list_empty(&sem->wait_list) && sem->count >= 0) + __rwsem_do_wake(sem, 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return -EINTR; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..40f983cbea81 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -300,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { + const char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -600,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) @@ -1273,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc) return *(u32 *)((void *)crc + *crc); } -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) { + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; @@ -1312,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs, } /* Broken toolchain. Warn once, then let it go.. 
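The check_version() rework above threads a single struct load_info through so diagnostics can name the module from .modinfo before the embedded struct module is trusted. A minimal userspace model of that shape follows; the field and function names are illustrative, not the kernel API:

#include <stdio.h>

struct load_info_model {
	const char *name;	/* from .modinfo, may be NULL early on */
};

static int check_version_model(const struct load_info_model *info,
			       const char *symname, int have_crc)
{
	if (!have_crc) {
		fprintf(stderr, "%s: no symbol version for %s\n",
			info->name ? info->name : "(unknown module)", symname);
		return 1;	/* warn, but let it go */
	}
	return 1;
}

int main(void)
{
	struct load_info_model info = { .name = "example_mod" };

	check_version_model(&info, "module_layout", 0);
	return 0;
}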
*/ - pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", - mod->name, symname); + info->name, symname); return 0; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { const s32 *crc; @@ -1338,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, BUG(); } preempt_enable(); - return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc); + return check_version(info, VMLINUX_SYMBOL_STR(module_layout), + mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1353,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } #else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) @@ -1362,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs, return 1; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { return 1; @@ -1399,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -1662,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static void add_usage_links(struct module *mod) +static void del_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; - int nowarn; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); mutex_unlock(&module_mutex); #endif } -static void del_usage_links(struct module *mod) +static int add_usage_links(struct module *mod) { + int ret = 0; #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); #endif + return ret; } static int module_add_modinfo_attrs(struct module *mod) @@ -1797,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; - add_usage_links(mod); + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + add_sect_attrs(mod, info); add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: @@ -2910,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) info->index.vers = 0; /* Pretend no __versions section! 
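The add_usage_links()/del_usage_links() change above switches to a create-then-roll-back-on-failure pattern. Below is a self-contained userspace sketch of the same error-unwind shape, with a fake link backend and a simulated failure; everything here is made up for illustration:

#include <stdio.h>

#define NLINKS 4

static int created[NLINKS];

static int create_link(int i)
{
	if (i == 2)		/* simulate a failure on the third link */
		return -1;
	created[i] = 1;
	return 0;
}

static void remove_links(void)
{
	for (int i = 0; i < NLINKS; i++)
		created[i] = 0;
}

static int add_links(void)
{
	int ret = 0;

	for (int i = 0; i < NLINKS; i++) {
		ret = create_link(i);
		if (ret)
			break;
	}
	if (ret)
		remove_links();	/* undo everything created so far */
	return ret;
}

int main(void)
{
	printf("add_links() -> %d\n", add_links());
	return 0;
}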
*/ else info->index.vers = find_sec(info, "__versions"); + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; } @@ -2952,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo name field)"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; + /* + * If we didn't load the .modinfo 'name' field, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = mod->name; + if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", info->name); return ERR_PTR(-ENOEXEC); } info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + if (!check_modstruct_version(info, mod)) return ERR_PTR(-ENOEXEC); return mod; @@ -2987,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { pr_err("%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); + info->name, modmagic, vermagic); return -ENOEXEC; } @@ -3237,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, /* module_blacklist is a comma-separated list of module names */ static char *module_blacklist; -static bool blacklisted(char *module_name) +static bool blacklisted(const char *module_name) { const char *p; size_t len; @@ -3267,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; - if (blacklisted(mod->name)) + if (blacklisted(info->name)) return ERR_PTR(-EPERM); err = check_modinfo(mod, info, flags); @@ -4196,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/kernel/pid.c b/kernel/pid.c index 731c4e528f4e..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); @@ -575,13 +572,10 @@ struct pid *find_ge_pid(int nr, 
struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int pidhash_size; - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); - pidhash_size = 1U << pidhash_shift; } void __init pidmap_init(void) diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -705,7 +705,7 @@ static struct attribute * g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 222317721c5a..0972a8e09d08 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state(NR_SLAB_RECLAIMABLE) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..0869b20fba81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2069,7 +2069,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened - * @cookie: context's cookie for pinning + * @rf: request-queue flags for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? sugov_update_shared : diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 67c70e287647..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; /* - * If either stime or both stime and utime are 0, assume all runtime is - * userspace. Once a task gets some ticks, the monotonicy code at - * 'update' will ensure things converge to the observed ratio. + * If either stime or utime are 0, assume all runtime is userspace. + * Once a task gets some ticks, the monotonicy code at 'update:' + * will ensure things converge to the observed ratio. */ - if (stime != 0) { - if (utime == 0) - stime = rtime; - else - stime = scale_stime(stime, rtime, stime + utime); + if (stime == 0) { + utime = rtime; + goto update; } + if (utime == 0) { + stime = rtime; + goto update; + } + + stime = scale_stime(stime, rtime, stime + utime); + +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. 
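A simplified userspace model of the reworked cputime_adjust() flow above: the stime == 0 and utime == 0 cases short-circuit, otherwise rtime is split in the observed stime:utime ratio. The kernel's overflow-safe scale_stime() is replaced by a plain 128-bit multiply here, and the monotonicity clamp at 'update:' is omitted:

#include <stdio.h>
#include <stdint.h>

static uint64_t scale(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}

static void adjust(uint64_t stime, uint64_t utime, uint64_t rtime,
		   uint64_t *out_stime, uint64_t *out_utime)
{
	if (stime == 0) {		/* all runtime counted as user time */
		*out_utime = rtime;
		*out_stime = 0;
		return;
	}
	if (utime == 0) {		/* all runtime counted as system time */
		*out_stime = rtime;
		*out_utime = 0;
		return;
	}
	*out_stime = scale(stime, rtime, stime + utime);
	*out_utime = rtime - *out_stime;
}

int main(void)
{
	uint64_t s, u;

	adjust(30, 70, 1000, &s, &u);	/* 30:70 tick ratio over 1000ns rtime */
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)s, (unsigned long long)u);
	return 0;
}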
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static u64 vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); + unsigned long long clock; - if (time_before(now, (unsigned long)tsk->vtime_snap)) + clock = sched_clock(); + if (clock < vtime->starttime) return 0; - return jiffies_to_nsecs(now - tsk->vtime_snap); + return clock - vtime->starttime; } -static u64 get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); - u64 delta, other; + u64 delta = vtime_delta(vtime); + u64 other; /* * Unlike tick based timing, vtime based timing never has lost @@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_snap); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap = now; + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); + vtime->starttime += delta; return delta - other; } -static void __vtime_account_system(struct task_struct *tsk) +static void __vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) +{ + vtime->stime += get_vtime_delta(vtime); + if (vtime->stime >= TICK_NSEC) { + account_system_time(tsk, irq_count(), vtime->stime); + vtime->stime = 0; + } +} + +static void vtime_account_guest(struct task_struct *tsk, + struct vtime *vtime) { - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); + vtime->gtime += get_vtime_delta(vtime); + if (vtime->gtime >= TICK_NSEC) { + account_guest_time(tsk, vtime->gtime); + vtime->gtime = 0; + } } void vtime_account_system(struct task_struct *tsk) { - if (!vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + if (!vtime_delta(vtime)) return; - write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + /* We might have scheduled out from guest path */ + if (current->flags & PF_VCPU) + vtime_account_guest(tsk, vtime); + else + __vtime_account_system(tsk, vtime); + write_seqcount_end(&vtime->seqcount); } -void vtime_account_user(struct task_struct *tsk) +void vtime_user_enter(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; - if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); - write_seqcount_end(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + __vtime_account_system(tsk, vtime); + vtime->state = VTIME_USER; + write_seqcount_end(&vtime->seqcount); } -void vtime_user_enter(struct task_struct *tsk) +void vtime_user_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime->utime += get_vtime_delta(vtime); + if (vtime->utime >= TICK_NSEC) { + account_user_time(tsk, vtime->utime); + vtime->utime = 0; + } + vtime->state = VTIME_SYS; + write_seqcount_end(&vtime->seqcount); } void vtime_guest_enter(struct task_struct *tsk) { + struct vtime *vtime 
= &tsk->vtime; /* * The flags must be updated under the lock with - * the vtime_snap flush and update. + * the vtime_starttime flush and update. * That enforces a right ordering and update sequence * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); + write_seqcount_begin(&vtime->seqcount); + __vtime_account_system(tsk, vtime); current->flags |= PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime_account_guest(tsk, vtime); current->flags &= ~PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(get_vtime_delta(tsk)); + account_idle_time(get_vtime_delta(&tsk->vtime)); } void arch_vtime_task_switch(struct task_struct *prev) { - write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_snap_whence = VTIME_INACTIVE; - write_seqcount_end(&prev->vtime_seqcount); + struct vtime *vtime = &prev->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_INACTIVE; + write_seqcount_end(&vtime->seqcount); + + vtime = ¤t->vtime; - write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = jiffies; - write_seqcount_end(¤t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = sched_clock(); + write_seqcount_end(&vtime->seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { + struct vtime *vtime = &t->vtime; unsigned long flags; local_irq_save(flags); - write_seqcount_begin(&t->vtime_seqcount); - t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = jiffies; - write_seqcount_end(&t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = sched_clock(); + write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } u64 task_gtime(struct task_struct *t) { + struct vtime *vtime = &t->vtime; unsigned int seq; u64 gtime; @@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t) return t->gtime; do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + gtime += vtime->gtime + vtime_delta(vtime); - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); return gtime; } @@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t) */ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - u64 delta; + struct vtime *vtime = &t->vtime; unsigned int seq; + u64 delta; if (!vtime_accounting_enabled()) { *utime = t->utime; @@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) + if (vtime->state == 
VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); + delta = vtime_delta(vtime); /* * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) - *utime += delta; - else if (t->vtime_snap_whence == VTIME_SYS) - *stime += delta; - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) + *utime += vtime->utime + delta; + else if (vtime->state == VTIME_SYS) + *stime += vtime->stime + delta; + } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 008c514dc241..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Also avoid computing new_dst_cpu if we have already computed - * one in current iteration. + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have + * already computed one in current iteration. 
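The task_cputime()/task_gtime() readers above rely on a seqcount retry loop to snapshot the vtime fields without locking. Here is a minimal userspace model of that reader/writer pattern using C11 atomics; it is illustrative only and glosses over the memory-barrier details of the real seqcount_t:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct sample {
	atomic_uint seq;	/* odd while an update is in progress */
	uint64_t gtime;
};

static void writer_update(struct sample *s, uint64_t delta)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
	s->gtime += delta;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);
}

static uint64_t reader_gtime(struct sample *s)
{
	unsigned int begin;
	uint64_t val;

	for (;;) {
		begin = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (begin & 1)
			continue;	/* update in flight, retry */
		val = s->gtime;
		if (atomic_load_explicit(&s->seq, memory_order_acquire) == begin)
			return val;	/* consistent snapshot */
	}
}

int main(void)
{
	struct sample s = { .gtime = 0 };

	writer_update(&s, 25);
	printf("gtime = %llu\n", (unsigned long long)reader_gtime(&s));
	return 0;
}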
*/ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ @@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .tasks = LIST_HEAD_INIT(env.tasks), }; - /* - * For NEWLY_IDLE load_balancing, we don't need to consider - * other cpus in our group - */ - if (idle == CPU_NEWLY_IDLE) - env.dst_grpmask = NULL; - - cpumask_copy(cpus, cpu_active_mask); + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); @@ -8151,7 +8144,15 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * Attempting to continue load balancing at the current + * sched_domain level only makes sense if there are + * active CPUs remaining as possible busiest CPUs to + * pull load from which are not contained within the + * destination group that is receiving any migrated + * load. + */ + if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, + /* + * can_migrate_task() doesn't need to compute new_dst_cpu + * for active balancing. Since we have CPU_IDLE, but no + * @dst_grpmask we need to make that test go away with lying + * about DST_PINNED. + */ + .flags = LBF_DST_PINNED, }; schedstat_inc(sd->alb_count); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } diff --git a/kernel/signal.c b/kernel/signal.c index 48a59eefd8ad..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -1402,6 +1406,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. 
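The guard being added below exists because kill(-pid, sig) addresses the process group -pid, and negating INT_MIN overflows signed int, which is undefined behaviour in C. A small userspace sketch of the same check; the helper name is made up:

#include <errno.h>
#include <limits.h>
#include <stdio.h>

static int kill_pgrp_model(int pid)
{
	if (pid == INT_MIN)
		return -ESRCH;		/* reject before any negation */
	if (pid < -1) {
		int pgrp = -pid;	/* safe: pid > INT_MIN here */
		printf("would signal process group %d\n", pgrp);
	}
	return 0;
}

int main(void)
{
	printf("INT_MIN -> %d\n", kill_pgrp_model(INT_MIN));
	printf("-1234   -> %d\n", kill_pgrp_model(-1234));
	return 0;
}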
Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, @@ -3299,12 +3307,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { +#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sizeof(old_sigset_t)); - if (err == 0) - if (copy_to_user(set32, &set, sizeof(old_sigset_t))) - err = -EFAULT; + int err = do_sigpending(&set, sizeof(set.sig[0])); + if (!err) + err = put_user(set.sig[0], set32); return err; +#else + return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); +#endif } #endif diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, ret = do_prlimit(current, resource, NULL, &r); if (!ret) { - struct rlimit r32; + struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. 
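From userspace, the practical difference between these modes shows up when a write to a numeric sysctl starts at a non-zero file offset. The sketch below only demonstrates the calling pattern; the path is a placeholder (it may not exist, and writing real sysctls needs privileges), and errors are simply reported:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/example_entry";	/* placeholder */
	const char *val = "1\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Seek past offset 0: with SYSCTL_WRITES_STRICT the numeric write
	 * below is ignored; with SYSCTL_WRITES_WARN it is applied but the
	 * kernel warns about the unexpected file position. */
	if (lseek(fd, 1, SEEK_SET) < 0)
		perror("lseek");
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}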
+ */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = { #endif }, { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, .maxlen = sizeof (int), @@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, - { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), @@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", @@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table) } /** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. 
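A userspace model of the helper documented above: given the configured write mode and the initial file position, decide whether a numeric write should be ignored. The enum and function names mirror the kernel loosely and are illustrative only:

#include <stdbool.h>
#include <stdio.h>

enum writes_mode {
	WRITES_LEGACY = -1,
	WRITES_WARN = 0,
	WRITES_STRICT = 1,
};

static bool first_pos_non_zero_ignore(long pos, enum writes_mode mode)
{
	if (pos == 0)
		return false;

	switch (mode) {
	case WRITES_STRICT:
		return true;		/* ignore the numeric write */
	case WRITES_WARN:
		fprintf(stderr, "sysctl write at offset %ld, expected 0\n", pos);
		return false;		/* warn, then behave like legacy */
	default:
		return false;		/* legacy: position does not matter */
	}
}

int main(void)
{
	printf("strict, pos 4 -> ignore=%d\n",
	       first_pos_non_zero_ignore(4, WRITES_STRICT));
	printf("warn,   pos 4 -> ignore=%d\n",
	       first_pos_non_zero_ignore(4, WRITES_WARN));
	return 0;
}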
+ */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + +/** * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file @@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, 
'\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. 
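Kernel users point a ctl_table entry's .proc_handler at proc_douintvec_minmax with .extra1/.extra2 giving the unsigned bounds, as the param setup below shows. The userspace sketch here models only the conversion/range check used above (do_proc_douintvec_minmax_conv); names and the order of the checks are simplified:

#include <errno.h>
#include <limits.h>
#include <stdio.h>

struct uint_minmax {
	unsigned int *min;
	unsigned int *max;
};

static int uint_minmax_conv(unsigned long lval, unsigned int *valp,
			    const struct uint_minmax *p)
{
	if (lval > UINT_MAX)
		return -EINVAL;
	if ((p->min && *p->min > lval) || (p->max && *p->max < lval))
		return -ERANGE;
	*valp = (unsigned int)lval;
	return 0;
}

int main(void)
{
	unsigned int min = 1, max = 60, val = 0;
	struct uint_minmax p = { &min, &max };
	int ret;

	ret = uint_minmax_conv(30, &val, &p);
	printf("write 30 -> %d (val=%u)\n", ret, val);
	ret = uint_minmax_conv(99, &val, &p);
	printf("write 99 -> %d (val=%u)\n", ret, val);
	return 0;
}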
+ */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 71ce3f4eead3..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. */ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. 
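A userspace model of the forward_timer_base() policy introduced above: the base clock is forwarded only when must_forward_clk is set (i.e. the CPU went idle) and jiffies has actually moved on; otherwise run_timers keeps the clock current. Struct and field names are simplified stand-ins, and the real code additionally caps the forward at the next pending expiry:

#include <stdbool.h>
#include <stdio.h>

struct base_model {
	unsigned long clk;
	bool is_idle;
	bool must_forward_clk;
};

static void forward_base(struct base_model *base, unsigned long jnow)
{
	if (!base->must_forward_clk)
		return;

	/* Stay armed only while the base is still idle. */
	base->must_forward_clk = base->is_idle;
	if ((long)(jnow - base->clk) < 2)
		return;

	base->clk = jnow;	/* the kernel also honours the next pending bucket */
}

int main(void)
{
	struct base_model base = { .clk = 1000, .is_idle = true,
				   .must_forward_clk = true };

	forward_base(&base, 1500);
	printf("clk forwarded to %lu\n", base.clk);
	return 0;
}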
+ */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) base->is_idle = false; } else { if (!is_max_delta) - expires = basem + (nextevt - basej) * TICK_NSEC; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. + */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. 
*/, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2953d558bbee..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. 
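The add_ftrace_ops() path above relies on ordered publication so a lock-free walker never observes a half-initialised entry: the new node is fully linked first, then the list head is updated via rcu_assign_pointer(). Below is a minimal userspace analogue using C11 atomics in place of the RCU primitives; it models only the publish ordering, not grace periods or removal:

#include <stdatomic.h>
#include <stdio.h>

struct ops {
	const char *name;
	struct ops *_Atomic next;
};

static struct ops *_Atomic ops_list;	/* head of the singly linked list */

static void add_ops(struct ops *new)
{
	/* Link the new node to the current head first... */
	atomic_store_explicit(&new->next,
			      atomic_load_explicit(&ops_list, memory_order_relaxed),
			      memory_order_relaxed);
	/* ...then publish it with release semantics. */
	atomic_store_explicit(&ops_list, new, memory_order_release);
}

int main(void)
{
	static struct ops a = { .name = "ops_a" }, b = { .name = "ops_b" };

	add_ops(&a);
	add_ops(&b);
	for (struct ops *p = atomic_load_explicit(&ops_list, memory_order_acquire);
	     p; p = atomic_load_explicit(&p->next, memory_order_acquire))
		printf("%s\n", p->name);
	return 0;
}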
We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -878,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; @@ -1569,8 +1584,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2855,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3816,7 +3832,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, @@ -3950,7 +3966,7 @@ static int cache_mod(struct trace_array *tr, continue; /* no func matches all */ - if (!func || strcmp(func, "*") == 0 || + if (strcmp(func, "*") == 0 || (ftrace_mod->func && strcmp(ftrace_mod->func, func) == 0)) { ret = 0; @@ -3978,6 +3994,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable); +#ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) { @@ -4068,6 +4085,7 @@ static void process_cached_mods(const char *mod_name) kfree(mod); } +#endif /* * We register the module command as a template to show others how @@ -6451,7 +6469,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page 
*page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. + * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 948ec32e0c27..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* @@ -2002,7 +2006,11 @@ int trace_find_tgid(int pid) static int trace_save_tgid(struct task_struct *tsk) { - if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) return 0; tgid_map[tsk->pid] = tsk->tgid; @@ -2029,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags) */ void tracing_record_taskinfo(struct task_struct *task, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && 
!trace_save_cmdline(task)) - return; - if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -2050,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && - (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) - return; + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - if ((flags & TRACE_RECORD_TGID) && - (!trace_save_tgid(prev) || !trace_save_tgid(next))) + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -3334,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - - seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); - seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); - seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); - seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ? tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? 
" | " : space); } void @@ -4689,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -6495,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6509,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? 
*/ if (info->read < PAGE_SIZE) @@ -6687,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } @@ -7671,6 +7780,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -7921,6 +8031,9 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); + trace_eval_init(); trace_create_eval_file(d_tracer); @@ -8186,6 +8299,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; @@ -8300,4 +8414,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b53c8d369163..c9b5aa10fbf9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* @@ -720,7 +728,7 @@ static int create_trace_kprobe(int argc, char **argv) return ret; } if (offset && is_return && - !function_offset_within_entry(NULL, symbol, offset)) { + !kprobe_on_func_entry(NULL, symbol, offset)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } @@ -736,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 
'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bbb78ef..305039b122fa 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) { - kfree(a); - return; - } + if (!a->pages) + goto free; for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; free_page((unsigned long)a->pages[i]); } + + kfree(a->pages); + + free: + kfree(a); } struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,7 +9,7 @@ * to those contributors as well. */ -#define pr_fmt(fmt) "NMI watchdog: " fmt +#define pr_fmt(fmt) "watchdog: " fmt #include <linux/mm.h> #include <linux/cpu.h> @@ -29,15 +29,58 @@ #include <linux/kvm_para.h> #include <linux/kthread.h> +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). 
+ */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,47 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ +void __weak watchdog_nmi_reconfigure(void) +{ +} + + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. 
Soft- @@ -161,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ @@ -213,18 +293,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +305,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +570,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. 
- */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -605,6 +622,100 @@ static void watchdog_disable_all_cpus(void) } #ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} +#endif + +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return 0; +} +#endif + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + watchdog_nmi_reconfigure(); + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + watchdog_nmi_reconfigure(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + +#ifdef CONFIG_SYSCTL /* * Update the run state of the lockup detectors. @@ -625,6 +736,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -810,10 +923,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. 
*/ - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); - -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +34,64 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. 
+ */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -94,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..ca937b0c3a96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + + if (cpumask_empty(cpumask)) { + pr_warn_once("WARNING: workqueue cpumask: online intersect > " + "possible intersect\n"); + return false; + } + return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: @@ -3744,8 +3751,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -3929,6 +3940,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; @@ -4119,13 +4140,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5253,7 +5275,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); |
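
Note on the ring_buffer_alloc_read_page() conversion above: the function now returns ERR_PTR(-ENODEV) when the requested CPU is not in the buffer's cpumask and ERR_PTR(-ENOMEM) on allocation failure, so the callers touched in this series (ring_buffer_benchmark, tracing_buffers_read, tracing_buffers_splice_read) all switch from NULL checks to IS_ERR()/PTR_ERR(). A minimal caller sketch of the post-patch contract, modelled on the updated kernel-doc comment in the diff; read_one_page() and process_page() are illustrative names only, not part of the patch:

#include <linux/ring_buffer.h>
#include <linux/err.h>

/* Hypothetical consumer of the filled reader page. */
static void process_page(void *page, int ret)
{
}

static int read_one_page(struct ring_buffer *buffer, int cpu, size_t len)
{
	void *rpage;
	int ret;

	rpage = ring_buffer_alloc_read_page(buffer, cpu);
	if (IS_ERR(rpage))
		return PTR_ERR(rpage);	/* -ENODEV or -ENOMEM per the hunks above */

	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
	if (ret >= 0)
		process_page(rpage, ret);

	/*
	 * The page should be handed back with ring_buffer_free_read_page()
	 * when done; its argument list is outside this diff, so it is not
	 * spelled out here.
	 */
	return ret;
}

On the tracing side, the new saved_tgids file created next to saved_cmdlines prints one "PID TGID" pair per line for every pid with a recorded tgid, per the saved_tgids_show() handler added above.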