diff options
-rw-r--r-- | Documentation/cgroups/blkio-controller.txt | 28 | ||||
-rw-r--r-- | block/blk-core.c | 108 | ||||
-rw-r--r-- | block/blk-flush.c | 439 | ||||
-rw-r--r-- | block/blk-settings.c | 7 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/blk-throttle.c | 6 | ||||
-rw-r--r-- | block/blk.h | 12 | ||||
-rw-r--r-- | block/cfq-iosched.c | 57 | ||||
-rw-r--r-- | block/elevator.c | 9 | ||||
-rw-r--r-- | block/genhd.c | 16 | ||||
-rw-r--r-- | drivers/block/loop.c | 3 | ||||
-rw-r--r-- | include/linux/blk_types.h | 2 | ||||
-rw-r--r-- | include/linux/blkdev.h | 25 | ||||
-rw-r--r-- | include/linux/elevator.h | 1 |
14 files changed, 429 insertions, 286 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 4ed7b5ceeed2..d915c16df42c 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -343,34 +343,6 @@ Common files among various policies CFQ sysfs tunable ================= -/sys/block/<disk>/queue/iosched/group_isolation ------------------------------------------------ - -If group_isolation=1, it provides stronger isolation between groups at the -expense of throughput. By default group_isolation is 0. In general that -means that if group_isolation=0, expect fairness for sequential workload -only. Set group_isolation=1 to see fairness for random IO workload also. - -Generally CFQ will put random seeky workload in sync-noidle category. CFQ -will disable idling on these queues and it does a collective idling on group -of such queues. Generally these are slow moving queues and if there is a -sync-noidle service tree in each group, that group gets exclusive access to -disk for certain period. That means it will bring the throughput down if -group does not have enough IO to drive deeper queue depths and utilize disk -capacity to the fullest in the slice allocated to it. But the flip side is -that even a random reader should get better latencies and overall throughput -if there are lots of sequential readers/sync-idle workload running in the -system. - -If group_isolation=0, then CFQ automatically moves all the random seeky queues -in the root group. That means there will be no service differentiation for -that kind of workload. This leads to better throughput as we do collective -idling on root sync-noidle tree. - -By default one should run with group_isolation=0. If that is not sufficient -and one wants stronger isolation between groups, then set group_isolation=1 -but this will come at cost of reduced throughput. - /sys/block/<disk>/queue/iosched/slice_idle ------------------------------------------ On a faster hardware CFQ can be slow, especially with sequential workload. diff --git a/block/blk-core.c b/block/blk-core.c index 518dd423a5fe..74d496ccf4d7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -149,39 +149,29 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - struct request_queue *q = rq->q; - - if (&q->flush_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __func__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - /* - * Okay, this is the sequenced flush request in - * progress, just record the error; - */ - if (error && !q->flush_err) - q->flush_err = error; - } + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -390,13 +380,16 @@ EXPORT_SYMBOL(blk_stop_queue); * that its ->make_request_fn will not re-add plugging prior to calling * this function. * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevaotor_exit() + * and blk_throtl_exit() to be called with queue lock initialized. + * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); - throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -457,6 +450,11 @@ void blk_put_queue(struct request_queue *q) kobject_put(&q->kobj); } +/* + * Note: If a driver supplied the queue lock, it should not zap that lock + * unexpectedly as some queue cleanup components like elevator_exit() and + * blk_throtl_exit() need queue lock. + */ void blk_cleanup_queue(struct request_queue *q) { /* @@ -475,6 +473,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -541,7 +541,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_flushes); + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -549,6 +551,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); + /* + * By default initialize queue_lock to internal lock and driver can + * override it later if need be. + */ + q->queue_lock = &q->__queue_lock; + return q; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -633,7 +641,10 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, q->unprep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; + + /* Override internal queue lock with supplied lock pointer */ + if (lock) + q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size @@ -762,6 +773,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) } /* + * Determine if elevator data should be initialized when allocating the + * request associated with @bio. + */ +static bool blk_rq_should_init_elevator(struct bio *bio) +{ + if (!bio) + return true; + + /* + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) + return false; + + return true; +} + +/* * Get a free request, queue_lock must be held. * Returns NULL on failure, with queue_lock held. * Returns !NULL on success, with queue_lock *not held*. @@ -773,7 +803,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv; + int may_queue, priv = 0; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -817,9 +847,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->count[is_sync]++; rl->starved[is_sync] = 0; - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (blk_rq_should_init_elevator(bio)) { + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); + if (priv) + rl->elvpriv++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; @@ -1220,7 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - where = ELEVATOR_INSERT_FRONT; + where = ELEVATOR_INSERT_FLUSH; goto get_rq; } @@ -1805,7 +1837,7 @@ static void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->flush_rq) { + if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; diff --git a/block/blk-flush.c b/block/blk-flush.c index b27d0208611b..0bd8c9c5d6e5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,6 +1,69 @@ /* * Functions to sequence FLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_FLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, FLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no + * data or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_FLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is + * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * flush is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of FLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without FLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each FLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. */ + #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> @@ -11,58 +74,143 @@ /* FLUSH/FUA sequences */ enum { - QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ - QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ - QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ - QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ - QUEUE_FSEQ_DONE = (1 << 4), + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), + + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, + + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. + */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static struct request *queue_next_fseq(struct request_queue *q); +static bool blk_kick_flush(struct request_queue *q); -unsigned blk_flush_cur_seq(struct request_queue *q) +static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { - if (!q->flush_seq) - return 0; - return 1 << ffz(q->flush_seq); + unsigned int policy = 0; + + if (fflags & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + return policy; } -static struct request *blk_flush_complete_seq(struct request_queue *q, - unsigned seq, int error) +static unsigned int blk_flush_cur_seq(struct request *rq) { - struct request *next_rq = NULL; - - if (error && !q->flush_err) - q->flush_err = error; - - BUG_ON(q->flush_seq & seq); - q->flush_seq |= seq; - - if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { - /* not complete yet, queue the next flush sequence */ - next_rq = queue_next_fseq(q); - } else { - /* complete this flush request */ - __blk_end_request_all(q->orig_flush_rq, q->flush_err); - q->orig_flush_rq = NULL; - q->flush_seq = 0; - - /* dispatch the next flush if there's one */ - if (!list_empty(&q->pending_flushes)) { - next_rq = list_entry_rq(q->pending_flushes.next); - list_move(&next_rq->queuelist, &q->queue_head); - } + return 1 << ffz(rq->flush.seq); +} + +static void blk_flush_restore_request(struct request *rq) +{ + /* + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. + */ + rq->bio = rq->biotail; + + /* make @rq a normal request */ + rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->end_io = NULL; +} + +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: FLUSH/FUA request being sequenced + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if requests were added to the dispatch queue, %false otherwise. + */ +static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, + int error) +{ + struct request_queue *q = rq->q; + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + bool queued = false; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + q->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; + + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_add(&rq->queuelist, &q->queue_head); + queued = true; + break; + + case REQ_FSEQ_DONE: + /* + * @rq was previously adjusted by blk_flush_issue() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. + */ + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + __blk_end_request_all(rq, error); + break; + + default: + BUG(); } - return next_rq; + + return blk_kick_flush(q) | queued; } -static void blk_flush_complete_seq_end_io(struct request_queue *q, - unsigned seq, int error) +static void flush_end_io(struct request *flush_rq, int error) { + struct request_queue *q = flush_rq->q; + struct list_head *running = &q->flush_queue[q->flush_running_idx]; bool was_empty = elv_queue_empty(q); - struct request *next_rq; + bool queued = false; + struct request *rq, *n; + + BUG_ON(q->flush_pending_idx == q->flush_running_idx); + + /* account completion of the flush request */ + q->flush_running_idx ^= 1; + elv_completed_request(q, flush_rq); + + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); - next_rq = blk_flush_complete_seq(q, seq, error); + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + queued |= blk_flush_complete_seq(rq, seq, error); + } /* * Moving a request silently to empty queue_head may stall the @@ -70,127 +218,154 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q, * from request completion path and calling directly into * request_fn may confuse the driver. Always use kblockd. */ - if (was_empty && next_rq) + if (queued && was_empty) __blk_run_queue(q, true); } -static void pre_flush_end_io(struct request *rq, int error) +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if flush was issued, %false otherwise. + */ +static bool blk_kick_flush(struct request_queue *q) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + + /* C1 described at the top of this file */ + if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + return false; + + /* C2 and C3 */ + if (!list_empty(&q->flush_data_in_flight) && + time_before(jiffies, + q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return false; + + /* + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. + */ + blk_rq_init(q, &q->flush_rq); + q->flush_rq.cmd_type = REQ_TYPE_FS; + q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq.rq_disk = first_rq->rq_disk; + q->flush_rq.end_io = flush_end_io; + + q->flush_pending_idx ^= 1; + elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); + return true; } static void flush_data_end_io(struct request *rq, int error) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); -} + struct request_queue *q = rq->q; + bool was_empty = elv_queue_empty(q); -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty) + __blk_run_queue(q, true); } -static void init_flush_request(struct request *rq, struct gendisk *disk) +/** + * blk_insert_flush - insert a new FLUSH/FUA request + * @rq: request to insert + * + * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_insert_flush(struct request *rq) { - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = WRITE_FLUSH; - rq->rq_disk = disk; -} + struct request_queue *q = rq->q; + unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); -static struct request *queue_next_fseq(struct request_queue *q) -{ - struct request *orig_rq = q->orig_flush_rq; - struct request *rq = &q->flush_rq; + BUG_ON(rq->end_io); + BUG_ON(!rq->bio || rq->bio != rq->biotail); - blk_rq_init(q, rq); + /* + * @policy now records what operations need to be done. Adjust + * REQ_FLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!(fflags & REQ_FUA)) + rq->cmd_flags &= ~REQ_FUA; - switch (blk_flush_cur_seq(q)) { - case QUEUE_FSEQ_PREFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = pre_flush_end_io; - break; - case QUEUE_FSEQ_DATA: - init_request_from_bio(rq, orig_rq->bio); - /* - * orig_rq->rq_disk may be different from - * bio->bi_bdev->bd_disk if orig_rq got here through - * remapping drivers. Make sure rq->rq_disk points - * to the same one as orig_rq. - */ - rq->rq_disk = orig_rq->rq_disk; - rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); - rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); - rq->end_io = flush_data_end_io; - break; - case QUEUE_FSEQ_POSTFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = post_flush_end_io; - break; - default: - BUG(); + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. + */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + list_add(&rq->queuelist, &q->queue_head); + return; } - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); - return rq; + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); + rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->end_io = flush_data_end_io; + + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); } -struct request *blk_do_flush(struct request_queue *q, struct request *rq) +/** + * blk_abort_flushes - @q is being aborted, abort flush requests + * @q: request_queue being aborted + * + * To be called from elv_abort_queue(). @q is being aborted. Prepare all + * FLUSH/FUA requests for abortion. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_abort_flushes(struct request_queue *q) { - unsigned int fflags = q->flush_flags; /* may change, cache it */ - bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; - bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); - bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); - unsigned skip = 0; + struct request *rq, *n; + int i; /* - * Special case. If there's data but flush is not necessary, - * the request can be issued directly. - * - * Flush w/o data should be able to be issued directly too but - * currently some drivers assume that rq->bio contains - * non-zero data if it isn't NULL and empty FLUSH requests - * getting here usually have bio's without data. + * Requests in flight for data are already owned by the dispatch + * queue or the device driver. Just restore for normal completion. */ - if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - return rq; + list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); } /* - * Sequenced flushes can't be processed in parallel. If - * another one is already in progress, queue for later - * processing. + * We need to give away requests on flush queues. Restore for + * normal completion and put them on the dispatch queue. */ - if (q->flush_seq) { - list_move_tail(&rq->queuelist, &q->pending_flushes); - return NULL; + for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { + list_for_each_entry_safe(rq, n, &q->flush_queue[i], + flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + list_add_tail(&rq->queuelist, &q->queue_head); + } } - - /* - * Start a new flush sequence - */ - q->flush_err = 0; - q->flush_seq |= QUEUE_FSEQ_STARTED; - - /* adjust FLUSH/FUA of the original request and stash it away */ - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - blk_dequeue_request(rq); - q->orig_flush_rq = rq; - - /* skip unneded sequences and return the first one */ - if (!do_preflush) - skip |= QUEUE_FSEQ_PREFLUSH; - if (!blk_rq_sectors(rq)) - skip |= QUEUE_FSEQ_DATA; - if (!do_postflush) - skip |= QUEUE_FSEQ_POSTFLUSH; - return blk_flush_complete_seq(q, skip, 0); } static void bio_end_flush(struct bio *bio, int err) diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f2af18..df649fa59ded 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -176,13 +176,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); /* - * If the caller didn't supply a lock, fall back to our embedded - * per-queue locks - */ - if (!q->queue_lock) - q->queue_lock = &q->__queue_lock; - - /* * by default assume old behaviour and bounce for any highmem page */ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 41fb69150b4d..261c75c665ae 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); - blk_throtl_exit(q); - if (rl->rq_pool) mempool_destroy(rl->rq_pool); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e36cc10a346c..061dee66e2a6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -968,7 +968,7 @@ static void throtl_update_blkio_group_write_iops(void *key, throtl_schedule_delayed_work(td, 0); } -void throtl_shutdown_timer_wq(struct request_queue *q) +static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1102,7 +1102,7 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!td); - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); spin_lock_irq(q->queue_lock); throtl_release_tgs(td); @@ -1132,7 +1132,7 @@ void blk_throtl_exit(struct request_queue *q) * update limits through cgroup and another work got queued, cancel * it. */ - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); throtl_td_free(td); } diff --git a/block/blk.h b/block/blk.h index 2db8f32838e7..284b500852bd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,21 +51,17 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -struct request *blk_do_flush(struct request_queue *q, struct request *rq); +void blk_insert_flush(struct request *rq); +void blk_abort_flushes(struct request_queue *q); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - rq == &q->flush_rq) - return rq; - rq = blk_do_flush(q, rq); - if (rq) - return rq; + return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea83a4f0c27d..89dc745c7d94 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -146,7 +146,6 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; - struct cfq_group *orig_cfqg; /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; @@ -285,7 +284,6 @@ struct cfq_data { unsigned int cfq_slice_idle; unsigned int cfq_group_idle; unsigned int cfq_latency; - unsigned int cfq_group_isolation; unsigned int cic_index; struct list_head cic_list; @@ -1187,32 +1185,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int new_cfqq = 1; int group_changed = 0; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD - && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { - /* Move this cfq to root group */ - cfq_log_cfqq(cfqd, cfqq, "moving to root group"); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfqq->orig_cfqg = cfqq->cfqg; - cfqq->cfqg = &cfqd->root_group; - cfqd->root_group.ref++; - group_changed = 1; - } else if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { - /* cfqq is sequential now needs to go to its original group */ - BUG_ON(cfqq->cfqg != &cfqd->root_group); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfq_put_cfqg(cfqq->cfqg); - cfqq->cfqg = cfqq->orig_cfqg; - cfqq->orig_cfqg = NULL; - group_changed = 1; - cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); - } -#endif - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { @@ -2542,7 +2514,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg, *orig_cfqg; + struct cfq_group *cfqg; BUG_ON(cfqq->ref <= 0); @@ -2554,7 +2526,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; - orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2564,8 +2535,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); - if (orig_cfqg) - cfq_put_cfqg(orig_cfqg); } /* @@ -3613,12 +3582,12 @@ static void cfq_put_request(struct request *rq) put_io_context(RQ_CIC(rq)->ioc); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; /* Put down rq reference on cfqg */ cfq_put_cfqg(RQ_CFQG(rq)); - rq->elevator_private3 = NULL; + rq->elevator_private[2] = NULL; cfq_put_queue(cfqq); } @@ -3705,13 +3674,13 @@ new_queue: } cfqq->allocated[rw]++; - cfqq->ref++; - rq->elevator_private = cic; - rq->elevator_private2 = cfqq; - rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); spin_unlock_irqrestore(q->queue_lock, flags); + cfqq->ref++; + rq->elevator_private[0] = cic; + rq->elevator_private[1] = cfqq; + rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); return 0; queue_fail: @@ -3953,7 +3922,6 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; - cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; /* * we optimistically start assuming sync ops weren't delayed in last @@ -4029,7 +3997,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4063,7 +4030,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4081,7 +4047,6 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), - CFQ_ATTR(group_isolation), __ATTR_NULL }; diff --git a/block/elevator.c b/block/elevator.c index 236e93c1f46c..fabf3675c913 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; + case ELEVATOR_INSERT_FLUSH: + rq->cmd_flags |= REQ_SOFTBARRIER; + blk_insert_flush(rq); + break; + default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); @@ -759,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (e->ops->elevator_set_req_fn) return e->ops->elevator_set_req_fn(q, rq, gfp_mask); - rq->elevator_private = NULL; + rq->elevator_private[0] = NULL; return 0; } @@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q) { struct request *rq; + blk_abort_flushes(q); + while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; diff --git a/block/genhd.c b/block/genhd.c index cbf1112a885c..3e2b57b55e38 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), + part_stat_read(hd, ios[READ]), + part_stat_read(hd, merges[READ]), + (unsigned long long)part_stat_read(hd, sectors[READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[READ])), + part_stat_read(hd, ios[WRITE]), + part_stat_read(hd, merges[WRITE]), + (unsigned long long)part_stat_read(hd, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index dbf31ec9114d..79c3079a6203 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1636,9 +1636,6 @@ out: static void loop_free(struct loop_device *lo) { - if (!lo->lo_queue->queue_lock) - lo->lo_queue->queue_lock = &lo->lo_queue->__queue_lock; - blk_cleanup_queue(lo->lo_queue); put_disk(lo->lo_disk); list_del(&lo->lo_list); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 46ad5197537a..dddedfc0af81 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -148,6 +148,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_FLUSH, /* request for cache flush */ + __REQ_FLUSH_SEQ, /* request for flush sequence */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ @@ -188,6 +189,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_FLUSH (1 << __REQ_FLUSH) +#define REQ_FLUSH_SEQ (1 << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5063e1b5555..13b75ca62181 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,11 +108,17 @@ struct request { /* * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the three elevator_private pointers. */ - void *elevator_private; - void *elevator_private2; - void *elevator_private3; + union { + void *elevator_private[3]; + struct { + unsigned int seq; + struct list_head list; + } flush; + }; struct gendisk *rq_disk; struct hd_struct *part; @@ -363,11 +369,12 @@ struct request_queue * for flush operations */ unsigned int flush_flags; - unsigned int flush_seq; - int flush_err; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; struct request flush_rq; - struct request *orig_flush_rq; - struct list_head pending_flushes; struct mutex sysfs_lock; @@ -1135,7 +1142,6 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); -extern void throtl_shutdown_timer_wq(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) { @@ -1144,7 +1150,6 @@ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline int blk_throtl_exit(struct request_queue *q) { return 0; } -static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} #endif /* CONFIG_BLK_DEV_THROTTLING */ #define MODULE_ALIAS_BLOCKDEV(major,minor) \ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4d857973d2c9..39b68edb388d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 +#define ELEVATOR_INSERT_FLUSH 5 /* * return values from elevator_may_queue_fn |