diff options
-rw-r--r-- | include/linux/sched.h | 54 | ||||
-rw-r--r-- | kernel/sched.c | 145 | ||||
-rw-r--r-- | kernel/sched_debug.c | 103 | ||||
-rw-r--r-- | kernel/sched_fair.c | 189 | ||||
-rw-r--r-- | kernel/sched_features.h | 55 | ||||
-rw-r--r-- | kernel/sched_rt.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 3 | ||||
-rw-r--r-- | kernel/user.c | 2 |
8 files changed, 166 insertions, 387 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..43c945152732 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -275,11 +275,17 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); +extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } + +static inline int nohz_ratelimit(int cpu) +{ + return 0; +} #endif /* @@ -1077,36 +1083,8 @@ struct load_weight { unsigned long weight, inv_weight; }; -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 6 se->load.weight - */ -struct sched_entity { - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - struct list_head group_node; - unsigned int on_rq; - - u64 exec_start; - u64 sum_exec_runtime; - u64 vruntime; - u64 prev_sum_exec_runtime; - - u64 last_wakeup; - u64 avg_overlap; - - u64 nr_migrations; - - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS +struct sched_statistics { u64 wait_start; u64 wait_max; u64 wait_count; @@ -1138,6 +1116,24 @@ struct sched_entity { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; +}; +#endif + +struct sched_entity { + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + + u64 nr_migrations; + +#ifdef CONFIG_SCHEDSTATS + struct sched_statistics statistics; #endif #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 49d2fa7b687a..52b7efd27416 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -492,8 +492,11 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ + u64 nohz_stamp; unsigned char in_nohz_recently; #endif + unsigned int skip_clock_update; + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -591,6 +594,13 @@ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { rq->curr->sched_class->check_preempt_curr(rq, p, flags); + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (test_tsk_need_resched(p)) + rq->skip_clock_update = 1; } static inline int cpu_of(struct rq *rq) @@ -625,7 +635,8 @@ static inline int cpu_of(struct rq *rq) inline void update_rq_clock(struct rq *rq) { - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -1228,6 +1239,17 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } + +int nohz_ratelimit(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 diff = rq->clock - rq->nohz_stamp; + + rq->nohz_stamp = rq->clock; + + return diff < (NSEC_PER_SEC / HZ) >> 1; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1770,8 +1792,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } - update_rq_clock(rq1); - update_rq_clock(rq2); } /* @@ -1868,9 +1888,7 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { - if (wakeup) - p->se.start_runtime = p->se.sum_exec_runtime; - + update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; @@ -1878,17 +1896,7 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep) { - if (p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } else { - update_avg(&p->se.avg_wakeup, - sysctl_sched_wakeup_granularity); - } - } - + update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; @@ -2361,14 +2369,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, unsigned long flags; struct rq *rq; - if (!sched_feat(SYNC_WAKEUPS)) - wake_flags &= ~WF_SYNC; - this_cpu = get_cpu(); smp_wmb(); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2409,7 +2413,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); - update_rq_clock(rq); /* * We migrated the task without holding either rq->lock, however @@ -2437,34 +2440,18 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); + schedstat_inc(p, se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.nr_wakeups_sync); + schedstat_inc(p, se.statistics.nr_wakeups_sync); if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); + schedstat_inc(p, se.statistics.nr_wakeups_migrate); if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); else - schedstat_inc(p, se.nr_wakeups_remote); + schedstat_inc(p, se.statistics.nr_wakeups_remote); activate_task(rq, p, 1); success = 1; - /* - * Only attribute actual wakeups done by this task. - */ - if (!in_interrupt()) { - struct sched_entity *se = ¤t->se; - u64 sample = se->sum_exec_runtime; - - if (se->last_wakeup) - sample -= se->last_wakeup; - else - sample -= se->start_runtime; - update_avg(&se->avg_wakeup, sample); - - se->last_wakeup = se->sum_exec_runtime; - } - out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, wake_flags); @@ -2526,42 +2513,9 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; - p->se.start_runtime = 0; - p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.wait_max = 0; - p->se.wait_count = 0; - p->se.wait_sum = 0; - - p->se.sleep_start = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - - p->se.block_start = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -2675,7 +2629,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; - update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); @@ -3629,23 +3582,9 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; - - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - - /* - * In order to avoid avg_overlap growing stale when we are - * indeed overlapping and hence not getting put to sleep, grow - * the avg_overlap on preemption. - * - * We use the average preemption runtime because that - * correlates to the amount of cache footprint a task can - * build up. - */ - update_avg(&prev->se.avg_overlap, runtime); - } + if (prev->se.on_rq) + update_rq_clock(rq); + rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3708,7 +3647,6 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -4265,7 +4203,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); oldprio = p->prio; prev_class = p->sched_class; @@ -4308,7 +4245,6 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); - update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4591,7 +4527,6 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -5602,7 +5537,6 @@ void sched_idle_next(void) __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - update_rq_clock(rq); activate_task(rq, p, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5657,7 +5591,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - update_rq_clock(rq); next = pick_next_task(rq); if (!next) break; @@ -5941,7 +5874,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; @@ -7891,7 +7823,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; - update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -7918,9 +7849,9 @@ void normalize_rt_tasks(void) p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; #endif if (!rt_task(p)) { @@ -8253,8 +8184,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - update_rq_clock(rq); - running = task_current(rq, tsk); on_rq = tsk->se.on_rq; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aada4b9..8a46a719f367 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->wait_start); - PN(se->sleep_start); - PN(se->block_start); - PN(se->sleep_max); - PN(se->block_max); - PN(se->exec_max); - PN(se->slice_max); - PN(se->wait_max); - PN(se->wait_sum); - P(se->wait_count); + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); #endif P(se->load.weight); #undef PN @@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.sum_sleep_runtime)); + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); @@ -407,40 +407,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); - PN(se.avg_overlap); - PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.wait_start); - PN(se.sleep_start); - PN(se.block_start); - PN(se.sleep_max); - PN(se.block_max); - PN(se.exec_max); - PN(se.slice_max); - PN(se.wait_max); - PN(se.wait_sum); - P(se.wait_count); - PN(se.iowait_sum); - P(se.iowait_count); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); - P(se.nr_migrations_cold); - P(se.nr_failed_migrations_affine); - P(se.nr_failed_migrations_running); - P(se.nr_failed_migrations_hot); - P(se.nr_forced_migrations); - P(se.nr_wakeups); - P(se.nr_wakeups_sync); - P(se.nr_wakeups_migrate); - P(se.nr_wakeups_local); - P(se.nr_wakeups_remote); - P(se.nr_wakeups_affine); - P(se.nr_wakeups_affine_attempts); - P(se.nr_wakeups_passive); - P(se.nr_wakeups_idle); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); { u64 avg_atom, avg_per_cpu; @@ -491,32 +489,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.wait_max = 0; - p->se.wait_sum = 0; - p->se.wait_count = 0; - p->se.iowait_sum = 0; - p->se.iowait_count = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.nr_migrations = 0; - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - p->sched_info.bkl_count = 0; + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5a5ea2cd924f..49ad99378f82 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 2000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 3; /* * After fork, child runs first. If set to 0 (default) then @@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, { unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); @@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); } /* @@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->wait_start); + rq_of(cfs_rq)->clock - se->statistics.wait_start); } #endif - schedstat_set(se->wait_start, 0); + schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (entity_is_task(se)) tsk = task_of(se); - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; - se->sleep_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->block_max)) - se->block_max = delta; + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; - se->block_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { - se->iowait_sum += delta; - se->iowait_count++; + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); } @@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ @@ -826,9 +817,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } @@ -912,7 +903,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, + se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif @@ -1240,7 +1231,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; @@ -1255,18 +1245,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync) { - if (sched_feat(SYNC_LESS) && - (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - } else { - if (sched_feat(SYNC_MORE) && - (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; - } - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1306,7 +1284,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (sync && balanced) return 1; - schedstat_inc(p, se.nr_wakeups_affine_attempts); + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || @@ -1318,7 +1296,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * there is no bad imbalance. */ schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1; } @@ -1451,13 +1429,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; - int want_affine = 0; + int want_affine = 0, cpu_idle = !current->pid; int want_sd = 1; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1509,13 +1486,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) + if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { if (tmp->flags & SD_WAKE_AFFINE) { affine_sd = tmp; want_affine = 0; + if (target != cpu) + cpu_idle = 1; } cpu = target; } @@ -1531,6 +1510,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over @@ -1544,9 +1524,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag if (tmp) update_shares(tmp); } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return cpu; + } while (sd) { int load_idx = sd->forkexec_idx; @@ -1591,63 +1574,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } #endif /* CONFIG_SMP */ -/* - * Adaptive granularity - * - * se->avg_wakeup gives the average time a task runs until it does a wakeup, - * with the limit of wakeup_gran -- when it never does a wakeup. - * - * So the smaller avg_wakeup is the faster we want this task to preempt, - * but we don't want to treat the preemptee unfairly and therefore allow it - * to run for at least the amount of time we'd like to run. - * - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one - * - * NOTE: we use *nr_running to scale with load, this nicely matches the - * degrading latency on load. - */ -static unsigned long -adaptive_gran(struct sched_entity *curr, struct sched_entity *se) -{ - u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; - u64 gran = 0; - - if (this_run < expected_wakeup) - gran = expected_wakeup - this_run; - - return min_t(s64, gran, sysctl_sched_wakeup_granularity); -} - static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; - if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) - gran = adaptive_gran(curr, se); - /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. */ - if (sched_feat(ASYM_GRAN)) { - /* - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } @@ -1705,7 +1651,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1738,14 +1683,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - - if (sched_feat(WAKEUP_OVERLAP) && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1844,13 +1781,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } *all_pinned = 0; if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); + schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -1866,14 +1803,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); + schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif return 1; } if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0; } return 1; @@ -3112,8 +3049,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd761d9..83c66e8ad3ee 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,11 +1,4 @@ /* - * Disregards a certain amount of sleep time (sched_latency_ns) and - * considers the task to be running during that period. This gives it - * a service deficit on wakeup, allowing it to run sooner. - */ -SCHED_FEAT(FAIR_SLEEPERS, 1) - -/* * Only give sleepers 50% of their service deficit. This allows * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. @@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1) SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* - * By not normalizing the sleep time, heavy tasks get an effective - * longer period, and lighter task an effective shorter period they - * are considered running. - */ -SCHED_FEAT(NORMALIZED_SLEEPER, 0) - -/* * Place new tasks ahead so that they do not starve already running * tasks */ @@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) /* - * Compute wakeup_gran based on task behaviour, clipped to - * [0, sched_wakeup_gran_ns] - */ -SCHED_FEAT(ADAPTIVE_GRAN, 1) - -/* - * When converting the wakeup granularity to virtual time, do it such - * that heavier tasks preempting a lighter task have an edge. - */ -SCHED_FEAT(ASYM_GRAN, 1) - -/* - * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. - */ -SCHED_FEAT(WAKEUP_SYNC, 0) - -/* - * Wakeup preempt based on task behaviour. Tasks that do not overlap - * don't get preempted. - */ -SCHED_FEAT(WAKEUP_OVERLAP, 0) - -/* - * Use the SYNC wakeup hint, pipes and the likes use this to indicate - * the remote end is likely to consume the data we just wrote, and - * therefore has cache benefit from being placed on the same cpu, see - * also AFFINE_WAKEUPS. - */ -SCHED_FEAT(SYNC_WAKEUPS, 1) - -/* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- * improve cache locality. Typically used with SYNC wakeups as @@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) /* - * Weaken SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_LESS, 1) - -/* - * Add SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_MORE, 0) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b5b920ae2ea7..012d69bb67c7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762d7f51..f25735a767af 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,6 +262,9 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } + if (nohz_ratelimit(cpu)) + goto end; + ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { diff --git a/kernel/user.c b/kernel/user.c index 766467b3bcb7..ec3b2229893b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -178,8 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) return up; - put_user_ns(new->user_ns); - kmem_cache_free(uid_cachep, new); out_unlock: return NULL; } |