diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 165 |
1 files changed, 156 insertions, 9 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dbe0f628efa3..85565053a6ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + struct list_head task_list; + + struct rcu_head rcu; + atomic_long_t faults[0]; +}; + static inline int task_faults_idx(int nid, int priv) { return 2 * nid + priv; @@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { + long diff; + i = task_faults_idx(nid, priv); + diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; @@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; + diff += p->numa_faults[i]; + if (p->numa_group) { + /* safe because we can only change our own group */ + atomic_long_add(diff, &p->numa_group->faults[i]); + } } if (faults > max_faults) { @@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p) } } +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static void task_numa_group(struct task_struct *p, int cpupid) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 2*nr_node_ids*sizeof(atomic_long_t); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_set(&grp->faults[i], p->numa_faults[i]); + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto unlock; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto unlock; + + my_grp = p->numa_group; + if (grp == my_grp) + goto unlock; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto unlock; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto unlock; + + if (!get_numa_group(grp)) + goto unlock; + + join = true; + +unlock: + rcu_read_unlock(); + + if (!join) + return; + + for (i = 0; i < 2*nr_node_ids; i++) { + atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]); + atomic_long_add(p->numa_faults[i], &grp->faults[i]); + } + + double_lock(&my_grp->lock, &grp->lock); + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); +} + +void task_numa_free(struct task_struct *p) +{ + struct numa_group *grp = p->numa_group; + int i; + + if (grp) { + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_sub(p->numa_faults[i], &grp->faults[i]); + + spin_lock(&grp->lock); + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock(&grp->lock); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + kfree(p->numa_faults); +} + /* * Got a PROT_NONE fault for a page on @node. */ @@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) if (!p->mm) return; - /* - * First accesses are treated as private, otherwise consider accesses - * to be private if the accessing pid has not changed - */ - if (!cpupid_pid_unset(last_cpupid)) - priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid)); - else - priv = 1; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; @@ -1245,6 +1380,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) } /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv) + task_numa_group(p, last_cpupid); + } + + /* * If pages are properly placed (did not migrate) then scan slower. * This is reset periodically in case of phase changes */ |