summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorJeff Moyer <jmoyer@redhat.com>2012-12-05 20:17:21 +0100
committerJens Axboe <axboe@kernel.dk>2012-12-05 20:17:21 +0100
commit8fa72d234da9b6b473bbb1f74d533663e4996e6b (patch)
tree536b8d2cf5aa3f0baee3aa0d89e602733be3df56 /mm
parentc304a51bf42a7ce48f430a371e1213472c89e13d (diff)
downloadtalos-op-linux-8fa72d234da9b6b473bbb1f74d533663e4996e6b.tar.gz
talos-op-linux-8fa72d234da9b6b473bbb1f74d533663e4996e6b.zip
bdi: add a user-tunable cpu_list for the bdi flusher threads
In realtime environments, it may be desirable to keep the per-bdi flusher threads from running on certain cpus. This patch adds a cpu_list file to /sys/class/bdi/* to enable this. The default is to tie the flusher threads to the same numa node as the backing device (though I could be convinced to make it a mask of all cpus to avoid a change in behaviour). Thanks to Jeremy Eder for the original idea. Signed-off-by: Jeff Moyer <jmoyer@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c84
1 files changed, 84 insertions, 0 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee176..bd6a6cabef71 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <linux/slab.h>
#include <trace/events/writeback.h>
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -221,12 +222,63 @@ static ssize_t max_ratio_store(struct device *dev,
}
BDI_SHOW(max_ratio, bdi->max_ratio)
+static ssize_t cpu_list_store(struct device *dev, /* sysfs store handler; paired with cpu_list_show() through __ATTR_RW(cpu_list) */
+ struct device_attribute *attr, const char *buf, size_t count) /* returns count once the mask is stored, else -ENOMEM or the non-zero result of cpulist_parse()/set_cpus_allowed_ptr() */
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ struct bdi_writeback *wb = &bdi->wb;
+ cpumask_var_t newmask; /* scratch mask parsed from buf; freed before returning */
+ ssize_t ret;
+ struct task_struct *task;
+
+ if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buf, newmask); /* a non-zero result skips the update and is returned unchanged */
+ if (!ret) {
+ spin_lock_bh(&bdi->wb_lock); /* wb->task is sampled under wb_lock, the lock bdi_forker_thread() holds when it sets bdi->wb.task */
+ task = wb->task;
+ if (task)
+ get_task_struct(task); /* reference taken before wb_lock is dropped; released by put_task_struct() below */
+ spin_unlock_bh(&bdi->wb_lock);
+
+ mutex_lock(&bdi->flusher_cpumask_lock); /* also taken by cpu_list_show() and by bdi_forker_thread() around its set_cpus_allowed_ptr() call */
+ if (task) { /* with no task recorded, ret stays 0 and only the stored mask is updated */
+ ret = set_cpus_allowed_ptr(task, newmask);
+ put_task_struct(task);
+ }
+ if (ret == 0) { /* the stored mask changes only if applying it to the task (when there is one) did not fail */
+ cpumask_copy(bdi->flusher_cpumask, newmask); /* NOTE(review): bdi_init() leaves flusher_cpumask NULL when bdi_cap_flush_forker(bdi) is true - confirm this store is never reached for such a bdi */
+ ret = count; /* report the whole write as consumed */
+ }
+ mutex_unlock(&bdi->flusher_cpumask_lock);
+
+ }
+ free_cpumask_var(newmask); /* reached on every path after a successful allocation */
+
+ return ret;
+}
+
+static ssize_t cpu_list_show(struct device *dev, /* sysfs show handler; paired with cpu_list_store() through __ATTR_RW(cpu_list) */
+ struct device_attribute *attr, char *page) /* returns the value produced by cpulist_scnprintf() */
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ ssize_t ret;
+
+ mutex_lock(&bdi->flusher_cpumask_lock); /* cpu_list_store() rewrites flusher_cpumask under this same mutex; NOTE(review): bdi_init() neither allocates the mask nor initializes this mutex when bdi_cap_flush_forker(bdi) is true - confirm this path is never reached for such a bdi */
+ ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask); /* output bounded to PAGE_SIZE-1; NOTE(review): nothing is appended after the list here (no explicit '\n') - confirm that is intended */
+ mutex_unlock(&bdi->flusher_cpumask_lock);
+
+ return ret;
+}
+
#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
static struct device_attribute bdi_dev_attrs[] = {
__ATTR_RW(read_ahead_kb),
__ATTR_RW(min_ratio),
__ATTR_RW(max_ratio),
+ __ATTR_RW(cpu_list), /* mode 0644; __ATTR_RW binds this entry to cpu_list_show() and cpu_list_store() */
__ATTR_NULL,
};
@@ -428,6 +480,7 @@ static int bdi_forker_thread(void *ptr)
writeback_inodes_wb(&bdi->wb, 1024,
WB_REASON_FORKER_THREAD);
} else {
+ int ret;
/*
* The spinlock makes sure we do not lose
* wake-ups when racing with 'bdi_queue_work()'.
@@ -437,6 +490,14 @@ static int bdi_forker_thread(void *ptr)
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
+ mutex_lock(&bdi->flusher_cpumask_lock);
+ ret = set_cpus_allowed_ptr(task,
+ bdi->flusher_cpumask);
+ mutex_unlock(&bdi->flusher_cpumask_lock);
+ if (ret)
+ printk_once("%s: failed to bind flusher"
+ " thread %s, error %d\n",
+ __func__, task->comm, ret);
wake_up_process(task);
}
bdi_clear_pending(bdi);
@@ -509,6 +570,17 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
dev_name(dev));
if (IS_ERR(wb->task))
return PTR_ERR(wb->task);
+ } else {
+ int node;
+ /*
+ * Set up a default cpumask for the flusher threads that
+ * includes all cpus on the same numa node as the device.
+ * The mask may be overridden via sysfs.
+ */
+ node = dev_to_node(bdi->dev);
+ if (node != NUMA_NO_NODE)
+ cpumask_copy(bdi->flusher_cpumask,
+ cpumask_of_node(node));
}
bdi_debug_register(bdi, dev_name(dev));
@@ -634,6 +706,15 @@ int bdi_init(struct backing_dev_info *bdi)
bdi_wb_init(&bdi->wb, bdi);
+ if (!bdi_cap_flush_forker(bdi)) {
+ bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!bdi->flusher_cpumask)
+ return -ENOMEM;
+ cpumask_setall(bdi->flusher_cpumask);
+ mutex_init(&bdi->flusher_cpumask_lock);
+ } else
+ bdi->flusher_cpumask = NULL;
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
@@ -656,6 +737,7 @@ int bdi_init(struct backing_dev_info *bdi)
err:
while (i--)
percpu_counter_destroy(&bdi->bdi_stat[i]);
+ kfree(bdi->flusher_cpumask);
}
return err;
@@ -683,6 +765,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
+ kfree(bdi->flusher_cpumask);
+
/*
* If bdi_unregister() had already been called earlier, the
* wakeup_timer could still be armed because bdi_prune_sb()
OpenPOWER on IntegriCloud