diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 67 | ||||
-rw-r--r-- | kernel/audit.h | 5 | ||||
-rw-r--r-- | kernel/audit_tree.c | 9 | ||||
-rw-r--r-- | kernel/audit_watch.c | 4 | ||||
-rw-r--r-- | kernel/auditfilter.c | 12 | ||||
-rw-r--r-- | kernel/auditsc.c | 16 | ||||
-rw-r--r-- | kernel/cgroup.c | 11 | ||||
-rw-r--r-- | kernel/cpuset.c | 13 | ||||
-rw-r--r-- | kernel/debug/debug_core.c | 16 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_main.c | 48 | ||||
-rw-r--r-- | kernel/irq/manage.c | 4 | ||||
-rw-r--r-- | kernel/jump_label.c | 77 | ||||
-rw-r--r-- | kernel/kprobes.c | 26 | ||||
-rw-r--r-- | kernel/sched.c | 8 | ||||
-rw-r--r-- | kernel/sched_fair.c | 25 | ||||
-rw-r--r-- | kernel/sched_stats.h | 20 |
16 files changed, 204 insertions, 157 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index d96045789b54..77770a034d59 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) struct task_struct *tsk; int err; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - err = -ESRCH; - if (!tsk) - goto out; - err = 0; - - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->audit_tty) - err = -EPERM; - spin_unlock_irq(&tsk->sighand->siglock); - if (err) - goto out; - - tty_audit_push_task(tsk, loginuid, sessionid); -out: - read_unlock(&tasklist_lock); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + err = tty_audit_push_task(tsk, loginuid, sessionid); + put_task_struct(tsk); return err; } @@ -506,7 +499,7 @@ int audit_send_list(void *_dest) } struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, - int multi, void *payload, int size) + int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the pid. * No failure notifications. */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size) +static void audit_send_reply(int pid, int seq, int type, int done, int multi, + const void *payload, int size) { struct sk_buff *skb; struct task_struct *tsk; @@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk; + unsigned long flags; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); + + if (!err) + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); break; } default: diff --git a/kernel/audit.h b/kernel/audit.h index f7206db4e13d..91e7071c4d2c 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size); -extern void audit_send_reply(int pid, int seq, int type, - int done, int multi, - void *payload, int size); + const void *payload, int size); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 7f18d3a4527e..37b2bea170c8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -223,7 +223,7 @@ static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new; + struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; int i, j; @@ -232,9 +232,14 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); + if (size) + new = alloc_chunk(size); + spin_lock(&entry->lock); if (chunk->dead || !entry->i.inode) { spin_unlock(&entry->lock); + if (new) + free_chunk(new); goto out; } @@ -255,9 +260,9 @@ static void untag_chunk(struct node *p) goto out; } - new = alloc_chunk(size); if (!new) goto Fallback; + fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f0c9b2e7542d..d2e3c7866460 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -60,7 +60,7 @@ struct audit_parent { }; /* fsnotify handle. */ -struct fsnotify_group *audit_watch_group; +static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ @@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch) } } -void audit_remove_watch(struct audit_watch *watch) +static void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); audit_put_parent(watch->parent); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eb7675499fb5..add2819af71b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, case AUDIT_LOGINUID: result = audit_comparator(cb->loginuid, f->op, f->val); break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) + result = security_audit_rule_match(cb->sid, + f->type, + f->op, + f->lsm_rule, + NULL); + break; } if (!result) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1b31c130d034..f49a0318c2ed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -241,6 +241,10 @@ struct audit_context { pid_t pid; struct audit_cap_data cap; } capset; + struct { + int fd; + int flags; + } mmap; }; int fds[2]; @@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); break; } + case AUDIT_MMAP: { + audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, + context->mmap.flags); + break; } } audit_log_end(ab); } @@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid, context->type = AUDIT_CAPSET; } +void __audit_mmap_fd(int fd, int flags) +{ + struct audit_context *context = current->audit_context; + context->mmap.fd = fd; + context->mmap.flags = flags; + context->type = AUDIT_MMAP; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cf366965d0c..66a416b42c18 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1460,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct cgroup_sb_opts opts; struct cgroupfs_root *root; @@ -1596,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_parsed_module_refcounts(opts.subsys_bits); } - simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - return 0; + return dget(sb->s_root); drop_new_super: deactivate_locked_super(sb); @@ -1608,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - return ret; + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) { @@ -1658,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) { static struct file_system_type cgroup_fs_type = { .name = "cgroup", - .get_sb = cgroup_get_sb, + .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51b143e2a07a..4349935c2ad8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static int cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) +static struct dentry *cpuset_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, void *data) { struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - int ret = -ENODEV; + struct dentry *ret = ERR_PTR(-ENODEV); if (cgroup_fs) { char mountopts[] = "cpuset,noprefix," "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->get_sb(cgroup_fs, flags, - unused_dev_name, mountopts, mnt); + ret = cgroup_fs->mount(cgroup_fs, flags, + unused_dev_name, mountopts); put_filesystem(cgroup_fs); } return ret; @@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type, static struct file_system_type cpuset_fs_type = { .name = "cpuset", - .get_sb = cpuset_get_sb, + .mount = cpuset_mount, }; /* diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fec596da9bd0..cefd4a11f6d9 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - /* * Some architectures need cache flushes when we set/clear a * breakpoint: @@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, atomic_inc(&masters_in_kgdb); else atomic_inc(&slaves_in_kgdb); - kgdb_disable_hw_debug(ks->linux_regs); + + if (arch_kgdb_ops.disable_hw_break) + arch_kgdb_ops.disable_hw_break(regs); acquirelock: /* diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d7bda21a106b..37755d621924 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, /* special case below */ } else { kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", - kdb_current, kdb_current->pid); + kdb_current, kdb_current ? kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); #endif @@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv) */ static int kdb_per_cpu(int argc, const char **argv) { - char buf[256], fmtstr[64]; - kdb_symtab_t symtab; - cpumask_t suppress = CPU_MASK_NONE; - int cpu, diag; - unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + char fmtstr[64]; + int cpu, diag, nextarg = 1; + unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; if (argc < 1 || argc > 3) return KDB_ARGCOUNT; - snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); - if (!kdbgetsymval(buf, &symtab)) { - kdb_printf("%s is not a per_cpu variable\n", argv[1]); - return KDB_BADADDR; - } + diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); + if (diag) + return diag; + if (argc >= 2) { diag = kdbgetularg(argv[2], &bytesperword); if (diag) @@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv) #define KDB_PCU(cpu) 0 #endif #endif - for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (whichcpu != ~0UL && whichcpu != cpu) continue; - addr = symtab.sym_start + KDB_PCU(cpu); + addr = symaddr + KDB_PCU(cpu); diag = kdb_getword(&val, addr, bytesperword); if (diag) { kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " "read, diag=%d\n", cpu, addr, diag); continue; } -#ifdef CONFIG_SMP - if (!val) { - cpu_set(cpu, suppress); - continue; - } -#endif /* CONFIG_SMP */ kdb_printf("%5d ", cpu); kdb_md_line(fmtstr, addr, bytesperword == KDB_WORD_SIZE, 1, bytesperword, 1, 1, 0); } - if (cpus_weight(suppress) == 0) - return 0; - kdb_printf("Zero suppressed cpu(s):"); - for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); - cpu = next_cpu(cpu, suppress)) { - kdb_printf(" %d", cpu); - if (cpu == num_possible_cpus() - 1 || - next_cpu(cpu, suppress) != cpu + 1) - continue; - while (cpu < num_possible_cpus() && - next_cpu(cpu, suppress) == cpu + 1) - ++cpu; - kdb_printf("-%d", cpu); - } - kdb_printf("\n"); - #undef KDB_PCU - return 0; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa367..5f92acc5f952 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -324,6 +324,10 @@ void enable_irq(unsigned int irq) if (!desc) return; + if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7be868bf25c6..3b79bd938330 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -39,6 +39,16 @@ struct jump_label_module_entry { struct module *mod; }; +void jump_label_lock(void) +{ + mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ + mutex_unlock(&jump_label_mutex); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) struct jump_label_module_entry *e_module; int count; - mutex_lock(&jump_label_mutex); + jump_label_lock(); entry = get_jump_label_entry((jump_label_t)key); if (entry) { count = entry->nr_entries; @@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type) count = e_module->nr_entries; iter = e_module->table; while (count--) { - if (kernel_text_address(iter->code)) + if (iter->key && + kernel_text_address(iter->code)) arch_jump_label_transform(iter, type); iter++; } } } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); } static int addr_conflict(struct jump_entry *entry, void *start, void *end) @@ -231,6 +242,7 @@ out: * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ @@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end) struct jump_entry *iter_stop = __start___jump_table; int conflict = 0; - mutex_lock(&jump_label_mutex); iter = iter_start; while (iter < iter_stop) { if (addr_conflict(iter, start, end)) { @@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end) conflict = module_conflict(start, end); #endif out: - mutex_unlock(&jump_label_mutex); return conflict; } +/* + * Not all archs need this. + */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} + static __init int init_jump_label(void) { int ret; @@ -267,7 +284,7 @@ static __init int init_jump_label(void) struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = build_jump_label_hashtable(__start___jump_table, __stop___jump_table); iter = iter_start; @@ -275,7 +292,7 @@ static __init int init_jump_label(void) arch_jump_label_text_poke_early(iter->code); iter++; } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); return ret; } early_initcall(init_jump_label); @@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod) } } +static void remove_jump_label_module_init(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod != mod) + continue; + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (within_module_init(iter->code, mod)) + iter->key = 0; + iter++; + } + } + } + } +} + static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = add_jump_label_module(mod); if (ret) remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_GOING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); + break; + case MODULE_STATE_LIVE: + jump_label_lock(); + remove_jump_label_module_init(mod); + jump_label_unlock(); break; } return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 99865c33a60d..9737a76e106f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1145,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p) if (ret) return ret; + jump_label_lock(); preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { - preempt_enable(); - return -EINVAL; - } + jump_label_text_reserved(p->addr, p->addr)) + goto fail_with_jump_label; /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1166,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p) * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } + if (unlikely(!try_module_get(probed_mod))) + goto fail_with_jump_label; + /* * If the module freed .init.text, we couldn't insert * kprobes in there. @@ -1177,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - preempt_enable(); - return -EINVAL; + goto fail_with_jump_label; } } preempt_enable(); + jump_label_unlock(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + jump_label_lock(); /* needed to call jump_label_text_reserved() */ + get_online_cpus(); /* For avoiding text_mutex deadlock. */ mutex_lock(&text_mutex); @@ -1214,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); put_online_cpus(); + jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) module_put(probed_mod); return ret; + +fail_with_jump_label: + preempt_enable(); + jump_label_unlock(); + return -EINVAL; } EXPORT_SYMBOL_GPL(register_kprobe); diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdfa..aa14a56f9d03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 933f3d1b62ea..f4f6a8326dd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6fc..48ddf431db0e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) } /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a cpu, we call this routine * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) } /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. |