Diffstat (limited to 'kernel/sched/cputime.c')
-rw-r--r--	kernel/sched/cputime.c	160
1 file changed, 130 insertions(+), 30 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e2..e0cd20693ef5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
 /*
  * Use precise platform statistics if available:
  */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
 {
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
-		vtime_account_system(prev);
+		vtime_account_kernel(prev);
 
 	vtime_flush(prev);
 	arch_vtime_task_switch(prev);
 }
 # endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
  * have other meaning of the idle time (s390 only includes the
  * time spent by the CPU when it's in low power mode) must override
  * vtime_account().
@@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 	if (!in_interrupt() && is_idle_task(tsk))
 		vtime_account_idle(tsk);
 	else
-		vtime_account_system(tsk);
+		vtime_account_kernel(tsk);
 }
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	u64 cputime, steal;
 	struct rq *rq = this_rq();
 
-	if (vtime_accounting_cpu_enabled())
+	if (vtime_accounting_enabled_this_cpu())
 		return;
 
 	if (sched_clock_irqtime) {
@@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk,
-				   struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+				 struct vtime *vtime)
 {
 	vtime->stime += get_vtime_delta(vtime);
 	if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
 	}
 }
 
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+				   struct vtime *vtime)
+{
+	/* We might have scheduled out from guest path */
+	if (vtime->state == VTIME_GUEST)
+		vtime_account_guest(tsk, vtime);
+	else
+		vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
 {
 	struct vtime *vtime = &tsk->vtime;
 
@@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
 		return;
 
 	write_seqcount_begin(&vtime->seqcount);
-	/* We might have scheduled out from guest path */
-	if (tsk->flags & PF_VCPU)
-		vtime_account_guest(tsk, vtime);
-	else
-		__vtime_account_system(tsk, vtime);
+	__vtime_account_kernel(tsk, vtime);
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk, vtime);
+	vtime_account_system(tsk, vtime);
 	vtime->state = VTIME_USER;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk, vtime);
+	vtime_account_system(tsk, vtime);
 	tsk->flags |= PF_VCPU;
+	vtime->state = VTIME_GUEST;
 	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
 	write_seqcount_begin(&vtime->seqcount);
 	vtime_account_guest(tsk, vtime);
 	tsk->flags &= ~PF_VCPU;
+	vtime->state = VTIME_SYS;
 	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
 	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
 {
 	struct vtime *vtime = &prev->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
+	if (vtime->state == VTIME_IDLE)
+		vtime_account_idle(prev);
+	else
+		__vtime_account_kernel(prev, vtime);
 	vtime->state = VTIME_INACTIVE;
+	vtime->cpu = -1;
 	write_seqcount_end(&vtime->seqcount);
 
 	vtime = &current->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	vtime->state = VTIME_SYS;
+	if (is_idle_task(current))
+		vtime->state = VTIME_IDLE;
+	else if (current->flags & PF_VCPU)
+		vtime->state = VTIME_GUEST;
+	else
+		vtime->state = VTIME_SYS;
 	vtime->starttime = sched_clock();
+	vtime->cpu = smp_processor_id();
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
-	vtime->state = VTIME_SYS;
+	vtime->state = VTIME_IDLE;
 	vtime->starttime = sched_clock();
+	vtime->cpu = cpu;
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
 
@@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
 		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+		if (vtime->state == VTIME_GUEST)
 			gtime += vtime->gtime + vtime_delta(vtime);
 
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +895,102 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		*utime = t->utime;
 		*stime = t->stime;
 
-		/* Task is sleeping, nothing to add */
-		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+		/* Task is sleeping or idle, nothing to add */
+		if (vtime->state < VTIME_SYS)
 			continue;
 
 		delta = vtime_delta(vtime);
 
 		/*
-		 * Task runs either in user or kernel space, add pending nohz time to
-		 * the right place.
+		 * Task runs either in user (including guest) or kernel space,
+		 * add pending nohz time to the right place.
 		 */
-		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-			*utime += vtime->utime + delta;
-		else if (vtime->state == VTIME_SYS)
+		if (vtime->state == VTIME_SYS)
 			*stime += vtime->stime + delta;
+		else
+			*utime += vtime->utime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
+
+static int kcpustat_field_vtime(u64 *cpustat,
+				struct vtime *vtime,
+				enum cpu_usage_stat usage,
+				int cpu, u64 *val)
+{
+	unsigned int seq;
+	int err;
+
+	do {
+		seq = read_seqcount_begin(&vtime->seqcount);
+
+		/*
+		 * We raced against context switch, fetch the
+		 * kcpustat task again.
+		 */
+		if (vtime->cpu != cpu && vtime->cpu != -1)
+			return -EAGAIN;
+
+		/*
+		 * Two possible things here:
+		 * 1) We are seeing the scheduling out task (prev) or any past one.
+		 * 2) We are seeing the scheduling in task (next) but it hasn't
+		 *    passed through vtime_task_switch() yet so the pending
+		 *    cputime of the prev task may not be flushed yet.
+		 *
+		 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+		 */
+		if (vtime->state == VTIME_INACTIVE)
+			return -EAGAIN;
+
+		err = 0;
+
+		*val = cpustat[usage];
+
+		if (vtime->state == VTIME_SYS)
+			*val += vtime->stime + vtime_delta(vtime);
+
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+		   enum cpu_usage_stat usage, int cpu)
+{
+	u64 *cpustat = kcpustat->cpustat;
+	struct rq *rq;
+	u64 val;
+	int err;
+
+	if (!vtime_accounting_enabled_cpu(cpu))
+		return cpustat[usage];
+
+	/* Only support sys vtime for now */
+	if (usage != CPUTIME_SYSTEM)
+		return cpustat[usage];
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		struct task_struct *curr;
+		struct vtime *vtime;
+
+		rcu_read_lock();
+		curr = rcu_dereference(rq->curr);
+		if (WARN_ON_ONCE(!curr)) {
+			rcu_read_unlock();
+			return cpustat[usage];
+		}
+
+		vtime = &curr->vtime;
+		err = kcpustat_field_vtime(cpustat, vtime, usage, cpu, &val);
+		rcu_read_unlock();
+
+		if (!err)
+			return val;
+
+		cpu_relax();
+	}
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
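The `vtime->state < VTIME_SYS` test in task_cputime() and the VTIME_IDLE/VTIME_GUEST handling in vtime_task_switch_generic() depend on the ordering of the vtime state constants. For reference, a sketch of the enum this series assumes (comments paraphrased, not copied; see the vtime_state definition in include/linux/sched.h for the authoritative version):

enum vtime_state {
	VTIME_INACTIVE = 0,	/* task is sleeping or runs on a CPU with vtime inactive */
	VTIME_IDLE,		/* task runs the idle loop */
	VTIME_SYS,		/* task runs in kernel space on a vtime-enabled CPU */
	VTIME_USER,		/* task runs in user space on a vtime-enabled CPU */
	VTIME_GUEST,		/* task runs as a guest on a vtime-enabled CPU */
};

Because VTIME_INACTIVE and VTIME_IDLE sort below VTIME_SYS, a single comparison covers both the sleeping and the idle case, while every state at or above VTIME_SYS has a pending tickless delta to fold in.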
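The exported kcpustat_field() is meant for readers of per-CPU cputime such as the /proc/stat code. A minimal usage sketch, assuming the existing kcpustat_cpu() and for_each_possible_cpu() helpers from <linux/kernel_stat.h> and <linux/cpumask.h>; the helper name sum_system_time() is illustrative only, not from the tree:

#include <linux/kernel_stat.h>
#include <linux/cpumask.h>

/* Hypothetical example: vtime-aware total system time, summed over all CPUs. */
static u64 sum_system_time(void)
{
	u64 sys = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		/*
		 * When vtime accounting is active on @cpu, this also folds
		 * in the pending tickless system time of the task currently
		 * running there; otherwise it returns the raw counter.
		 */
		sys += kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_SYSTEM, cpu);
	}

	return sys;
}

Callers need no extra locking: the -EAGAIN plus cpu_relax() retry loop inside kcpustat_field() handles races against context switch, and the rq->curr dereference is protected by RCU within the accessor itself.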