diff options
Diffstat (limited to 'block')
52 files changed, 5494 insertions, 2014 deletions
diff --git a/block/Kconfig b/block/Kconfig index 8b5f8e560eb4..3bc76bb113a0 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,9 +26,15 @@ menuconfig BLOCK if BLOCK +config BLK_RQ_ALLOC_TIME + bool + config BLK_SCSI_REQUEST bool +config BLK_CGROUP_RWSTAT + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y @@ -60,7 +66,6 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" - select CRC_T10DIF if BLK_DEV_INTEGRITY ---help--- Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer @@ -71,6 +76,11 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_INTEGRITY_T10 + tristate + depends on BLK_DEV_INTEGRITY + select CRC_T10DIF + config BLK_DEV_ZONED bool "Zoned block device support" select MQ_IOSCHED_DEADLINE @@ -83,6 +93,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y + select BLK_CGROUP_RWSTAT ---help--- Block layer bio throttling support. It can be used to limit the IO rate to a device. IO rate policies are per cgroup and @@ -132,6 +143,16 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. +config BLK_CGROUP_IOCOST + bool "Enable support for cost model based cgroup IO controller" + depends on BLK_CGROUP=y + select BLK_RQ_ALLOC_TIME + ---help--- + Enabling this option enables the .weight interface for cost + model based proportional IO control. The IO controller + distributes IO capacity between different groups based on + their share of the overall weight distribution. + config BLK_WBT_MQ bool "Multiqueue writeback throttling" default y diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b89310a022ad..7df14133adc8 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -31,6 +31,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + select BLK_CGROUP_RWSTAT ---help--- Enable hierarchical scheduling in BFQ, using the blkio diff --git a/block/Makefile b/block/Makefile index eee1b4ceecf9..1a43750f4b01 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,16 +16,18 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o +obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o +obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0f6cd688924f..09b69a3ed490 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -332,7 +332,7 @@ static void bfqg_put(struct bfq_group *bfqg) kfree(bfqg); } -static void bfqg_and_blkg_get(struct bfq_group *bfqg) +void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); @@ -347,6 +347,17 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) bfqg_put(bfqg); } +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) +{ + struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + + if (!bfqg) + return; + + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); + blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); +} + /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { @@ -431,6 +442,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { + blkg_rwstat_exit(&stats->bytes); + blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); @@ -448,6 +461,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { + if (blkg_rwstat_init(&stats->bytes, gfp) || + blkg_rwstat_init(&stats->ios, gfp)) + return -ENOMEM; + #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || @@ -501,11 +518,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_bfqgd(cpd)); } -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) { struct bfq_group *bfqg; - bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); if (!bfqg) return NULL; @@ -633,9 +651,15 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); + /* + * get extra reference to prevent bfqq from being freed in + * next possible deactivate + */ + bfqq->ref++; + if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); - else if (entity->on_st) + else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(bfqq_group(bfqq)); @@ -652,6 +676,8 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + /* release extra ref taken above */ + bfq_put_queue(bfqq); } /** @@ -904,7 +930,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static int bfq_io_show_weight(struct seq_file *sf, void *v) +static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); @@ -918,6 +944,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + if (!bfqg->entity.dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); +} + +static int bfq_io_show_weight(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + + seq_printf(sf, "default %u\n", bfqgd->weight); + blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, + &blkcg_policy_bfq, 0, false); + return 0; +} + +static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) +{ + weight = dev_weight ?: weight; + + bfqg->entity.dev_weight = dev_weight; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. + */ + if ((unsigned short)weight != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)weight; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } +} + static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) @@ -936,61 +1016,70 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. - * Set the flag only if necessary. - */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). - * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. - */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } + if (bfqg) + bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } -static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - u64 weight; - /* First unsigned long found in the file is used */ - int ret = kstrtoull(strim(buf), 0, &weight); + int ret; + struct blkg_conf_ctx ctx; + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct bfq_group *bfqg; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); if (ret) return ret; - ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v) + goto out; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out; + } + + bfqg = blkg_to_bfqg(ctx.blkg); + + ret = -ERANGE; + if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { + bfq_group_set_weight(bfqg, bfqg->entity.weight, v); + ret = 0; + } +out: + blkg_conf_finish(&ctx); return ret ?: nbytes; } -#ifdef CONFIG_BFQ_CGROUP_DEBUG -static int bfqg_print_stat(struct seq_file *sf, void *v) +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); - return 0; + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); + return ret ?: nbytes; + } + + return bfq_io_set_device_weight(of, buf, nbytes, off); } static int bfqg_print_rwstat(struct seq_file *sf, void *v) @@ -1000,6 +1089,31 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} + static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1025,15 +1139,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, sum); } -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample sum; - - blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), @@ -1042,18 +1147,11 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); + u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } @@ -1070,8 +1168,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, { struct blkg_rwstat_sample tmp; - blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &tmp); + blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, + offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); @@ -1141,20 +1239,26 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, + .seq_show = bfq_io_show_weight_legacy, .write_u64 = bfq_io_set_weight_legacy, }, + { + .name = "bfq.weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { @@ -1191,13 +1295,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { @@ -1302,6 +1406,10 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) return bfqq->bfqd->root_group; } +void bfqg_and_blkg_get(struct bfq_group *bfqg) {} + +void bfqg_and_blkg_put(struct bfq_group *bfqg) {} + struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b33be928d164..8c436abfaf14 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -427,7 +427,6 @@ void bfq_schedule_dispatch(struct bfq_data *bfqd) } #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) @@ -614,6 +613,10 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } + /* oom_bfqq does not participate in queue merging */ + if (bfqq == &bfqd->oom_bfqq) + return; + /* * bfqq cannot be merged any longer (see comments in * bfq_setup_cooperator): no point in adding bfqq into the @@ -1056,7 +1059,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st - + return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - (bfqq->weight_counter != NULL); } @@ -2016,7 +2019,7 @@ static void bfq_add_request(struct request *rq) (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(100))) { + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); /* * Start the state machine for measuring the @@ -2025,7 +2028,21 @@ static void bfq_add_request(struct request *rq) * be set when rq will be dispatched. */ bfqd->wait_dispatch = true; - bfqd->rqs_injected = false; + /* + * If there is no I/O in service in the drive, + * then possible injection occurred before the + * arrival of rq will not affect the total + * service time of rq. So the injection limit + * must not be updated as a function of such + * total service time, unless new injection + * occurs before rq is completed. To have the + * injection limit updated only in the latter + * case, reset rqs_injected here (rqs_injected + * will be set in case injection is performed + * on bfqq before rq is completed). + */ + if (bfqd->rq_in_driver == 0) + bfqd->rqs_injected = false; } } @@ -2699,6 +2716,28 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } + +static +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * To prevent bfqq's service guarantees from being violated, + * bfqq may be left busy, i.e., queued for service, even if + * empty (see comments in __bfq_bfqq_expire() for + * details). But, if no process will send requests to bfqq any + * longer, then there is no point in keeping bfqq queued for + * service. In addition, keeping bfqq queued for service, but + * with no process ref any longer, may have caused bfqq to be + * freed when dequeued from service. But this is assumed to + * never happen. + */ + if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, false); + + bfq_put_queue(bfqq); +} + static void bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) @@ -2769,8 +2808,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; - /* release process reference to bfqq */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -3409,6 +3447,10 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + return (bfqq->wr_coeff > 1 && (bfqd->wr_busy_queues < bfq_tot_busy_queues(bfqd) || @@ -4042,6 +4084,10 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, bfqq_sequential_and_IO_bound, idling_boosts_thr; + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); @@ -4135,6 +4181,10 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) struct bfq_data *bfqd = bfqq->bfqd; bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + if (unlikely(bfqd->strict_guarantees)) return true; @@ -4775,9 +4825,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; -#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqg = bfqq_group(bfqq); -#endif if (bfqq->bfqd) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", @@ -4850,9 +4898,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_and_blkg_put(bfqg); -#endif } static void bfq_put_cooperator(struct bfq_queue *bfqq) @@ -4885,7 +4931,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); /* release process reference */ + bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) @@ -4987,8 +5033,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - /* release process reference on this queue */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); bic_set_bfqq(bic, bfqq, false); } @@ -5450,6 +5495,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool idle_timer_disabled = false; unsigned int cmd_flags; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) + bfqg_stats_update_legacy_io(q, rq); +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { spin_unlock_irq(&bfqd->lock); @@ -5784,14 +5833,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; - if (bfqq->last_serv_time_ns > 0) { + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; if (tot_time_ns >= threshold && old_limit > 0) { bfqq->inject_limit--; bfqq->decrease_time_jif = jiffies; } else if (tot_time_ns < threshold && - old_limit < bfqd->max_rq_in_driver<<1) + old_limit <= bfqd->max_rq_in_driver) bfqq->inject_limit++; } @@ -5809,12 +5858,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, */ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* + * Now we certainly have a base value: make sure we + * start trying injection. + */ + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } bfqq->last_serv_time_ns = tot_time_ns; - /* - * Now we certainly have a base value: make sure we - * start trying injection. - */ - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) /* * No I/O injected and no request still in service in @@ -5830,6 +5881,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, /* update complete, not waiting for any request completion any longer */ bfqd->waited_rq = NULL; + bfqd->rqs_injected = false; } /* @@ -5927,6 +5979,8 @@ static void bfq_finish_requeue_request(struct request *rq) } /* + * Removes the association between the current task and bfqq, assuming + * that bic points to the bfq iocontext of the task. * Returns NULL if a new bfqq should be allocated, or the old bfqq if this * was the last process referring to that bfqq. */ @@ -5946,7 +6000,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } @@ -6334,10 +6388,10 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED /* release oom-queue reference to root group */ bfqg_and_blkg_put(bfqd->root_group); +#ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index e80adf822bbe..d1233af9c684 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -10,6 +10,8 @@ #include <linux/hrtimer.h> #include <linux/blk-cgroup.h> +#include "blk-cgroup-rwstat.h" + #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) @@ -148,7 +150,7 @@ struct bfq_entity { * Flag, true if the entity is on a tree (either the active or * the idle one of its service_tree) or is in service. */ - bool on_st; + bool on_st_or_in_serv; /* B-WF2Q+ start and finish timestamps [sectors/weight] */ u64 start, finish; @@ -168,6 +170,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* device weight, if non-zero, it overrides the default weight of + * bfq_group_data */ + int dev_weight; /* weight of the queue */ int weight; /* next weight if a change is in progress */ @@ -806,6 +811,9 @@ struct bfq_stat { }; struct bfqg_stats { + /* basic stats */ + struct blkg_rwstat bytes; + struct blkg_rwstat ios; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* number of ios merged */ struct blkg_rwstat merged; @@ -913,6 +921,7 @@ struct bfq_group { #else struct bfq_group { + struct bfq_entity entity; struct bfq_sched_data sched_data; struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; @@ -953,6 +962,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); @@ -975,6 +985,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); +void bfqg_and_blkg_get(struct bfq_group *bfqg); void bfqg_and_blkg_put(struct bfq_group *bfqg); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -1059,6 +1070,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ @@ -1075,6 +1088,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index c9ba225081ce..eb0e2a6daabe 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -277,10 +277,7 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) */ static u64 bfq_delta(unsigned long service, unsigned long weight) { - u64 d = (u64)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; + return div64_ul((u64)service << WFQ_SERVICE_SHIFT, weight); } /** @@ -536,7 +533,9 @@ static void bfq_get_entity(struct bfq_entity *entity) bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, bfqq->ref); - } + } else + bfqg_and_blkg_get(container_of(entity, struct bfq_group, + entity)); } /** @@ -648,10 +647,16 @@ static void bfq_forget_entity(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - entity->on_st = false; + entity->on_st_or_in_serv = false; st->wsum -= entity->weight; - if (bfqq && !is_in_service) + if (is_in_service) + return; + + if (bfqq) bfq_put_queue(bfqq); + else + bfqg_and_blkg_put(container_of(entity, struct bfq_group, + entity)); } /** @@ -744,6 +749,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, } #endif + /* Matches the smp_wmb() in bfq_group_set_weight. */ + smp_rmb(); old_st->wsum -= entity->weight; if (entity->new_weight != entity->orig_weight) { @@ -1000,7 +1007,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, */ bfq_get_entity(entity); - entity->on_st = true; + entity->on_st_or_in_serv = true; } #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -1166,7 +1173,10 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) struct bfq_service_tree *st; bool is_in_service; - if (!entity->on_st) /* entity never activated, or already inactive */ + if (!entity->on_st_or_in_serv) /* + * entity never activated, or + * already inactive + */ return false; /* @@ -1621,7 +1631,7 @@ bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) * service tree either, then release the service reference to * the queue it represents (taken with bfq_get_entity). */ - if (!in_serv_entity->on_st) { + if (!in_serv_entity->on_st_or_in_serv) { /* * If no process is referencing in_serv_bfqq any * longer, then the service reference may be the only diff --git a/block/bio-integrity.c b/block/bio-integrity.c index fb95dbb21dd8..bf62c25cde8f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc); * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ -static void bio_integrity_free(struct bio *bio) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; diff --git a/block/bio.c b/block/bio.c index 299a0e7651ec..94d697217887 100644 --- a/block/bio.c +++ b/block/bio.c @@ -233,6 +233,9 @@ fallback: void bio_uninit(struct bio *bio) { bio_disassociate_blkg(bio); + + if (bio_integrity(bio)) + bio_integrity_free(bio); } EXPORT_SYMBOL(bio_uninit); @@ -536,6 +539,55 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) EXPORT_SYMBOL(zero_fill_bio_iter); /** + * bio_truncate - truncate the bio to small size of @new_size + * @bio: the bio to be truncated + * @new_size: new size for truncating the bio + * + * Description: + * Truncate the bio to new size of @new_size. If bio_op(bio) is + * REQ_OP_READ, zero the truncated part. This function should only + * be used for handling corner cases, such as bio eod. + */ +void bio_truncate(struct bio *bio, unsigned new_size) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned int done = 0; + bool truncated = false; + + if (new_size >= bio->bi_iter.bi_size) + return; + + if (bio_op(bio) != REQ_OP_READ) + goto exit; + + bio_for_each_segment(bv, bio, iter) { + if (done + bv.bv_len > new_size) { + unsigned offset; + + if (!truncated) + offset = new_size - done; + else + offset = 0; + zero_user(bv.bv_page, offset, bv.bv_len - offset); + truncated = true; + } + done += bv.bv_len; + } + + exit: + /* + * Don't touch bvec table here and make it really immutable, since + * fs bio user has to retrieve all pages via bio_for_each_segment_all + * in its .end_bio() callback. + * + * It is enough to truncate bio by updating .bi_size since we can make + * correct bvec with the updated .bi_size for drivers. + */ + bio->bi_iter.bi_size = new_size; +} + +/** * bio_put - release a reference to a bio * @bio: bio to release reference to * @@ -646,25 +698,20 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return true; } -/* - * Check if the @page can be added to the current segment(@bv), and make - * sure to call it only if page_is_mergeable(@bv, @page) is true - */ -static bool can_add_page_to_seg(struct request_queue *q, - struct bio_vec *bv, struct page *page, unsigned len, - unsigned offset) +static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; - if (bv->bv_len + len > queue_max_segment_size(q)) return false; - - return true; + return __bio_try_merge_page(bio, page, len, offset, same_page); } /** @@ -674,7 +721,7 @@ static bool can_add_page_to_seg(struct request_queue *q, * @page: page to add * @len: vec entry length * @offset: vec entry offset - * @put_same_page: put the page if it is same with last added page + * @same_page: return if the merge happen inside the same page * * Attempt to add a page to the bio_vec maplist. This can fail for a * number of reasons, such as the bio being full or target block device @@ -685,10 +732,9 @@ static bool can_add_page_to_seg(struct request_queue *q, */ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, - bool put_same_page) + bool *same_page) { struct bio_vec *bvec; - bool same_page = false; /* * cloned bio must not modify vec list @@ -700,28 +746,16 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == bvec->bv_page && - offset == bvec->bv_offset + bvec->bv_len) { - if (put_same_page) - put_page(page); - bvec->bv_len += len; - goto done; - } + if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page)) + return len; /* - * If the queue doesn't support SG gaps and adding this - * offset would create a gap, disallow it. + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. */ + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (bvec_gap_to_prev(q, bvec, offset)) return 0; - - if (page_is_mergeable(bvec, page, len, offset, &same_page) && - can_add_page_to_seg(q, bvec, page, len, offset)) { - bvec->bv_len += len; - goto done; - } } if (bio_full(bio, len)) @@ -735,7 +769,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, bvec->bv_len = len; bvec->bv_offset = offset; bio->bi_vcnt++; - done: bio->bi_iter.bi_size += len; return len; } @@ -743,7 +776,8 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_pc_page(q, bio, page, len, offset, false); + bool same_page = false; + return __bio_add_pc_page(q, bio, page, len, offset, &same_page); } EXPORT_SYMBOL(bio_add_pc_page); @@ -773,6 +807,8 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (page_is_mergeable(bv, page, len, off, same_page)) { + if (bio->bi_iter.bi_size > UINT_MAX - len) + return false; bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -806,6 +842,9 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; + + if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) + bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1384,13 +1423,17 @@ struct bio *bio_map_user_iov(struct request_queue *q, for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; + bool same_page = false; if (n > bytes) n = bytes; if (!__bio_add_pc_page(q, bio, page, n, offs, - true)) + &same_page)) { + if (same_page) + put_page(page); break; + } added += n; bytes -= n; @@ -1521,7 +1564,6 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, bio->bi_end_io = bio_map_kern_endio; return bio; } -EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio) { @@ -1842,8 +1884,8 @@ EXPORT_SYMBOL(bio_endio); * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point - * to @bio's bi_io_vec; it is the caller's responsibility to ensure that - * @bio is not freed before the split. + * to @bio's bi_io_vec. It is the caller's responsibility to ensure that + * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c new file mode 100644 index 000000000000..85d5790ac49b --- /dev/null +++ b/block/blk-cgroup-rwstat.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#include "blk-cgroup-rwstat.h" + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_init); + +void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_exit); + +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; + + if (!dname) + return 0; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + rwstat->cnt[i]); + + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); + +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. + */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat_sample rwstat = { }; + + blkg_rwstat_read((void *)pd + off, &rwstat); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat_sample structure containing the results + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. + */ +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + unsigned int i; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; + + if (!pos_blkg->online) + continue; + + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h new file mode 100644 index 000000000000..ee746919c41f --- /dev/null +++ b/block/blk-cgroup-rwstat.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#ifndef _BLK_CGROUP_RWSTAT_H +#define _BLK_CGROUP_RWSTAT_H + +#include <linux/blk-cgroup.h> + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ +struct blkg_rwstat { + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; +}; + +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); +void blkg_rwstat_exit(struct blkg_rwstat *rwstat); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum); + + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @op: REQ_OP and flags + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + unsigned int op, uint64_t val) +{ + struct percpu_counter *cnt; + + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it in the aux counts. + */ +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat_sample *result) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat_sample tmp = { }; + + blkg_rwstat_read(rwstat, &tmp); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } +} + +/** + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + u64 sum[BLKG_RWSTAT_NR]; + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} +#endif /* _BLK_CGROUP_RWSTAT_H */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 55a7dc227dfb..a229b94d5390 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blkg_rwstat_exit(&blkg->stat_ios); - blkg_rwstat_exit(&blkg->stat_bytes); + free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } @@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; - int i; + int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); @@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto err_free; - if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || - blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); + if (!blkg->iostat_cpu) goto err_free; blkg->q = q; @@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) + u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -175,7 +178,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); if (!pd) goto err_free; @@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; - struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -410,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) pol->pd_offline_fn(blkg->pd[i]); } - if (parent) { - blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); - blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); - } - blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -464,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i; + int i, cpu; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -475,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - blkg_rwstat_reset(&blkg->stat_bytes); - blkg_rwstat_reset(&blkg->stat_ios); + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *bis = + per_cpu_ptr(blkg->iostat_cpu, cpu); + memset(bis, 0, sizeof(*bis)); + } + memset(&blkg->iostat, 0, sizeof(blkg->iostat)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -560,197 +561,55 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/** - * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @rwstat: rwstat to print - * - * Print @rwstat to @sf for the device assocaited with @pd. - */ -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat) -{ - static const char *rwstr[] = { - [BLKG_RWSTAT_READ] = "Read", - [BLKG_RWSTAT_WRITE] = "Write", - [BLKG_RWSTAT_SYNC] = "Sync", - [BLKG_RWSTAT_ASYNC] = "Async", - [BLKG_RWSTAT_DISCARD] = "Discard", - }; - const char *dname = blkg_dev_name(pd->blkg); - u64 v; - int i; - - if (!dname) - return 0; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - rwstat->cnt[i]); - - v = rwstat->cnt[BLKG_RWSTAT_READ] + - rwstat->cnt[BLKG_RWSTAT_WRITE] + - rwstat->cnt[BLKG_RWSTAT_DISCARD]; - seq_printf(sf, "%s Total %llu\n", dname, v); - return v; -} -EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); - -/** - * blkg_prfill_rwstat - prfill callback for blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_rwstat in @pd - * - * prfill callback for printing a blkg_rwstat. - */ -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} -EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); - -static u64 blkg_prfill_rwstat_field(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd->blkg + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_bytes. - * cftype->private must be set to the blkcg_policy. - */ -int blkg_print_stat_bytes(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private - * must be set to the blkcg_policy. - */ -int blkg_print_stat_ios(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios); - -static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat; - - blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +/* Performs queue bypass and policy enabled checks then looks up blkg. */ +static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, + const struct blkcg_policy *pol, + struct request_queue *q) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(&q->queue_lock); -/** - * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; + if (!blkcg_policy_enabled(q, pol)) + return ERR_PTR(-EOPNOTSUPP); + return __blkg_lookup(blkcg, q, true /* update_hint */); } -EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); /** - * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the blkg_rwstat - * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg - * @sum: blkg_rwstat_sample structure containing the results + * blkg_conf_prep - parse and prepare for per-blkg config update + * @inputp: input string pointer * - * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update + * from @input and get and return the matching gendisk. *@inputp is + * updated to point past the device node prefix. Returns an ERR_PTR() + * value on error. * - * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it - * is at @off bytes into @blkg's blkg_policy_data of the policy. + * Use this function iff blkg_conf_prep() can't be used for some reason. */ -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum) +struct gendisk *blkcg_conf_get_disk(char **inputp) { - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - unsigned int i; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat; + char *input = *inputp; + unsigned int major, minor; + struct gendisk *disk; + int key_len, part; - if (!pos_blkg->online) - continue; + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) + return ERR_PTR(-EINVAL); - if (pol) - rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - rwstat = (void *)pos_blkg + off; + input += key_len; + if (!isspace(*input)) + return ERR_PTR(-EINVAL); + input = skip_spaces(input); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk) + return ERR_PTR(-ENODEV); + if (part) { + put_disk_and_module(disk); + return ERR_PTR(-ENODEV); } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); -/* Performs queue bypass and policy enabled checks then looks up blkg. */ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); + *inputp = input; + return disk; } /** @@ -772,25 +631,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; - unsigned int major, minor; - int key_len, part, ret; - char *body; - - if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return -EINVAL; + int ret; - body = input + key_len; - if (!isspace(*body)) - return -EINVAL; - body = skip_spaces(body); - - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) - return -ENODEV; - if (part) { - ret = -ENODEV; - goto fail; - } + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); q = disk->queue; @@ -856,7 +701,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, success: ctx->disk = disk; ctx->blkg = blkg; - ctx->body = body; + ctx->body = input; return 0; fail_unlock: @@ -876,6 +721,7 @@ fail: } return ret; } +EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update @@ -891,26 +737,34 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) rcu_read_unlock(); put_disk_and_module(ctx->disk); } +EXPORT_SYMBOL_GPL(blkg_conf_finish); static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; + cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkg_iostat_set *bis = &blkg->iostat; const char *dname; char *buf; - struct blkg_rwstat_sample rwstat; u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; bool has_stats = false; + unsigned seq; + + spin_lock_irq(&blkg->q->queue_lock); + + if (!blkg->online) + goto skip; dname = blkg_dev_name(blkg); if (!dname) - continue; + goto skip; /* * Hooray string manipulation, count is the size written NOT @@ -920,21 +774,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - spin_lock_irq(&blkg->q->queue_lock); - - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &rwstat); - rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; - wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + do { + seq = u64_stats_fetch_begin(&bis->sync); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_ios), &rwstat); - rios = rwstat.cnt[BLKG_RWSTAT_READ]; - wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; - - spin_unlock_irq(&blkg->q->queue_lock); + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -973,6 +822,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) seq_commit(sf, -1); } } + skip: + spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); @@ -1211,26 +1062,6 @@ err_unlock: } /** - * blkcg_drain_queue - drain blkcg part of request_queue - * @q: request_queue to drain - * - * Called from blk_drain_queue(). Responsible for draining blkcg part. - */ -void blkcg_drain_queue(struct request_queue *q) -{ - lockdep_assert_held(&q->queue_lock); - - /* - * @q could be exiting and already have destroyed all blkgs as - * indicated by NULL root_blkg. If so, don't confuse policies. - */ - if (!q->root_blkg) - return; - - blk_throtl_drain(q); -} - -/** * blkcg_exit_queue - exit and release blkcg part of request_queue * @q: request_queue being released * @@ -1268,6 +1099,77 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1300,6 +1202,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, @@ -1336,7 +1239,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { struct blkg_policy_data *pd_prealloc = NULL; - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; if (blkcg_policy_enabled(q, pol)) @@ -1344,49 +1247,82 @@ int blkcg_activate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q); -pd_prealloc: - if (!pd_prealloc) { - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); - if (!pd_prealloc) { - ret = -ENOMEM; - goto out_bypass_end; - } - } - +retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to init parents first */ + /* blkg_list is pushed at the head, reverse walk to allocate parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); - if (!pd) - swap(pd, pd_prealloc); + /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ + if (blkg == pinned_blkg) { + pd = pd_prealloc; + pd_prealloc = NULL; + } else { + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, + blkg->blkcg); + } + if (!pd) { + /* + * GFP_NOWAIT failed. Free the existing one and + * prealloc for @blkg w/ GFP_KERNEL. + */ + if (pinned_blkg) + blkg_put(pinned_blkg); + blkg_get(blkg); + pinned_blkg = blkg; + spin_unlock_irq(&q->queue_lock); - goto pd_prealloc; + + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, + blkg->blkcg); + if (pd_prealloc) + goto retry; + else + goto enomem; } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - if (pol->pd_init_fn) - pol->pd_init_fn(pd); } + /* all allocated, init in the same order */ + if (pol->pd_init_fn) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_init_fn(blkg->pd[pol->plid]); + __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); -out_bypass_end: +out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); + if (pinned_blkg) + blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; + +enomem: + /* alloc failed, nothing's initialized yet, free everything */ + spin_lock_irq(&q->queue_lock); + list_for_each_entry(blkg, &q->blkg_list, q_node) { + if (blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } + } + spin_unlock_irq(&q->queue_lock); + ret = -ENOMEM; + goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1475,7 +1411,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(cpd); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } } diff --git a/block/blk-core.c b/block/blk-core.c index d0cc6e14d2f0..089e890ab208 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,8 +34,10 @@ #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> +#include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> +#include <linux/psi.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -129,6 +131,10 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), + REQ_OP_NAME(ZONE_RESET_ALL), + REQ_OP_NAME(ZONE_OPEN), + REQ_OP_NAME(ZONE_CLOSE), + REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -333,18 +339,19 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); */ void blk_cleanup_queue(struct request_queue *q) { + WARN_ON_ONCE(blk_queue_registered(q)); + /* mark @q DYING, no new request or merges will be allowed afterwards */ - mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_queue_flag_set(QUEUE_FLAG_DYING, q); - mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to - * prevent that q->request_fn() gets invoked after draining finished. + * prevent that blk_mq_run_hw_queues() accesses the hardware queues + * after draining finished. */ blk_freeze_queue(q); @@ -479,7 +486,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); @@ -518,6 +524,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); + mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); @@ -601,6 +608,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio, return false; trace_block_bio_backmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -622,6 +630,7 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio, return false; trace_block_bio_frontmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -647,6 +656,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) goto no_merge; + rq_qos_merge(q, req, bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -840,11 +851,7 @@ static inline int blk_partition_remap(struct bio *bio) if (unlikely(bio_check_ro(bio, p))) goto out; - /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. - */ - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { + if (bio_sectors(bio)) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; @@ -878,11 +885,14 @@ generic_make_request_checks(struct bio *bio) } /* - * For a REQ_NOWAIT based request, return -EOPNOTSUPP - * if queue is not a request based queue. + * Non-mq queues do not honor REQ_NOWAIT, so complete a bio + * with BLK_STS_AGAIN status in order to catch -EAGAIN and + * to give a chance to the caller to repeat request gracefully. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) - goto not_supported; + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) { + status = BLK_STS_AGAIN; + goto end_io; + } if (should_fail_bio(bio)) goto end_io; @@ -928,9 +938,16 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; + case REQ_OP_ZONE_RESET_ALL: + if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) + goto not_supported; + break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; @@ -1128,6 +1145,10 @@ EXPORT_SYMBOL_GPL(direct_make_request); */ blk_qc_t submit_bio(struct bio *bio) { + bool workingset_read = false; + unsigned long pflags; + blk_qc_t ret; + if (blkcg_punt_bio_submit(bio)) return BLK_QC_T_NONE; @@ -1146,6 +1167,8 @@ blk_qc_t submit_bio(struct bio *bio) if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } else { + if (bio_flagged(bio, BIO_WORKINGSET)) + workingset_read = true; task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } @@ -1160,7 +1183,21 @@ blk_qc_t submit_bio(struct bio *bio) } } - return generic_make_request(bio); + /* + * If we're reading data that is part of the userspace + * workingset, count submission time as memory stall. When the + * device is congested, or the submitting cgroup IO-throttled, + * submission can be a significant part of overall IO time. + */ + if (workingset_read) + psi_memstall_enter(&pflags); + + ret = generic_make_request(bio); + + if (workingset_read) + psi_memstall_leave(&pflags); + + return ret; } EXPORT_SYMBOL(submit_bio); @@ -1276,7 +1313,7 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { - if (blk_do_io_stat(req)) { + if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; @@ -1294,7 +1331,8 @@ void blk_account_io_done(struct request *req, u64 now) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { + if (req->part && blk_do_io_stat(req) && + !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; @@ -1405,6 +1443,12 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); @@ -1752,9 +1796,9 @@ int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * - FIELD_SIZEOF(struct request, cmd_flags)); + sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * - FIELD_SIZEOF(struct bio, bi_opf)); + sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/block/blk-exec.c b/block/blk-exec.c index 1db44ca0f4a6..e20a852ae432 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,6 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + blk_account_io_start(rq, true); + /* * don't check dying flag for MQ because the request won't * be reused after dying flag is set diff --git a/block/blk-flush.c b/block/blk-flush.c index aedd9320e605..3f977c517960 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,6 +69,7 @@ #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/blk-mq.h> +#include <linux/lockdep.h> #include "blk.h" #include "blk-mq.h" @@ -136,6 +137,17 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) blk_mq_add_to_requeue_list(rq, add_front, true); } +static void blk_account_io_flush(struct request *rq) +{ + struct hd_struct *part = &rq->rq_disk->part0; + + part_stat_lock(); + part_stat_inc(part, ios[STAT_FLUSH]); + part_stat_add(part, nsecs[STAT_FLUSH], + ktime_get_ns() - rq->start_time_ns); + part_stat_unlock(); +} + /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced @@ -185,7 +197,7 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DONE: /* - * @rq was previously adjusted by blk_flush_issue() for + * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. @@ -212,8 +224,20 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); struct blk_mq_hw_ctx *hctx; + blk_account_io_flush(flush_rq); + /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + + if (!refcount_dec_and_test(&flush_rq->ref)) { + fq->rq_status = error; + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return; + } + + if (fq->rq_status != BLK_STS_OK) + error = fq->rq_status; + hctx = flush_rq->mq_hctx; if (!q->elevator) { blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); @@ -482,6 +506,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); + lockdep_register_key(&fq->key); + lockdep_set_class(&fq->mq_flush_lock, &fq->key); + return fq; fail_rq: @@ -496,6 +523,7 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; + lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ca39b4624cf8..ff1070edbb40 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) return BLK_STS_OK; } +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, }; /** diff --git a/block/blk-iocost.c b/block/blk-iocost.c new file mode 100644 index 000000000000..27ca68621137 --- /dev/null +++ b/block/blk-iocost.c @@ -0,0 +1,2472 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * IO cost model based controller. + * + * Copyright (C) 2019 Tejun Heo <tj@kernel.org> + * Copyright (C) 2019 Andy Newell <newella@fb.com> + * Copyright (C) 2019 Facebook + * + * One challenge of controlling IO resources is the lack of trivially + * observable cost metric. This is distinguished from CPU and memory where + * wallclock time and the number of bytes can serve as accurate enough + * approximations. + * + * Bandwidth and iops are the most commonly used metrics for IO devices but + * depending on the type and specifics of the device, different IO patterns + * easily lead to multiple orders of magnitude variations rendering them + * useless for the purpose of IO capacity distribution. While on-device + * time, with a lot of clutches, could serve as a useful approximation for + * non-queued rotational devices, this is no longer viable with modern + * devices, even the rotational ones. + * + * While there is no cost metric we can trivially observe, it isn't a + * complete mystery. For example, on a rotational device, seek cost + * dominates while a contiguous transfer contributes a smaller amount + * proportional to the size. If we can characterize at least the relative + * costs of these different types of IOs, it should be possible to + * implement a reasonable work-conserving proportional IO resource + * distribution. + * + * 1. IO Cost Model + * + * IO cost model estimates the cost of an IO given its basic parameters and + * history (e.g. the end sector of the last IO). The cost is measured in + * device time. If a given IO is estimated to cost 10ms, the device should + * be able to process ~100 of those IOs in a second. + * + * Currently, there's only one builtin cost model - linear. Each IO is + * classified as sequential or random and given a base cost accordingly. + * On top of that, a size cost proportional to the length of the IO is + * added. While simple, this model captures the operational + * characteristics of a wide varienty of devices well enough. Default + * paramters for several different classes of devices are provided and the + * parameters can be configured from userspace via + * /sys/fs/cgroup/io.cost.model. + * + * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate + * device-specific coefficients. + * + * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate + * device-specific coefficients. + * + * 2. Control Strategy + * + * The device virtual time (vtime) is used as the primary control metric. + * The control strategy is composed of the following three parts. + * + * 2-1. Vtime Distribution + * + * When a cgroup becomes active in terms of IOs, its hierarchical share is + * calculated. Please consider the following hierarchy where the numbers + * inside parentheses denote the configured weights. + * + * root + * / \ + * A (w:100) B (w:300) + * / \ + * A0 (w:100) A1 (w:100) + * + * If B is idle and only A0 and A1 are actively issuing IOs, as the two are + * of equal weight, each gets 50% share. If then B starts issuing IOs, B + * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, + * 12.5% each. The distribution mechanism only cares about these flattened + * shares. They're called hweights (hierarchical weights) and always add + * upto 1 (HWEIGHT_WHOLE). + * + * A given cgroup's vtime runs slower in inverse proportion to its hweight. + * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) + * against the device vtime - an IO which takes 10ms on the underlying + * device is considered to take 80ms on A0. + * + * This constitutes the basis of IO capacity distribution. Each cgroup's + * vtime is running at a rate determined by its hweight. A cgroup tracks + * the vtime consumed by past IOs and can issue a new IO iff doing so + * wouldn't outrun the current device vtime. Otherwise, the IO is + * suspended until the vtime has progressed enough to cover it. + * + * 2-2. Vrate Adjustment + * + * It's unrealistic to expect the cost model to be perfect. There are too + * many devices and even on the same device the overall performance + * fluctuates depending on numerous factors such as IO mixture and device + * internal garbage collection. The controller needs to adapt dynamically. + * + * This is achieved by adjusting the overall IO rate according to how busy + * the device is. If the device becomes overloaded, we're sending down too + * many IOs and should generally slow down. If there are waiting issuers + * but the device isn't saturated, we're issuing too few and should + * generally speed up. + * + * To slow down, we lower the vrate - the rate at which the device vtime + * passes compared to the wall clock. For example, if the vtime is running + * at the vrate of 75%, all cgroups added up would only be able to issue + * 750ms worth of IOs per second, and vice-versa for speeding up. + * + * Device business is determined using two criteria - rq wait and + * completion latencies. + * + * When a device gets saturated, the on-device and then the request queues + * fill up and a bio which is ready to be issued has to wait for a request + * to become available. When this delay becomes noticeable, it's a clear + * indication that the device is saturated and we lower the vrate. This + * saturation signal is fairly conservative as it only triggers when both + * hardware and software queues are filled up, and is used as the default + * busy signal. + * + * As devices can have deep queues and be unfair in how the queued commands + * are executed, soley depending on rq wait may not result in satisfactory + * control quality. For a better control quality, completion latency QoS + * parameters can be configured so that the device is considered saturated + * if N'th percentile completion latency rises above the set point. + * + * The completion latency requirements are a function of both the + * underlying device characteristics and the desired IO latency quality of + * service. There is an inherent trade-off - the tighter the latency QoS, + * the higher the bandwidth lossage. Latency QoS is disabled by default + * and can be set through /sys/fs/cgroup/io.cost.qos. + * + * 2-3. Work Conservation + * + * Imagine two cgroups A and B with equal weights. A is issuing a small IO + * periodically while B is sending out enough parallel IOs to saturate the + * device on its own. Let's say A's usage amounts to 100ms worth of IO + * cost per second, i.e., 10% of the device capacity. The naive + * distribution of half and half would lead to 60% utilization of the + * device, a significant reduction in the total amount of work done + * compared to free-for-all competition. This is too high a cost to pay + * for IO control. + * + * To conserve the total amount of work done, we keep track of how much + * each active cgroup is actually using and yield part of its weight if + * there are other cgroups which can make use of it. In the above case, + * A's weight will be lowered so that it hovers above the actual usage and + * B would be able to use the rest. + * + * As we don't want to penalize a cgroup for donating its weight, the + * surplus weight adjustment factors in a margin and has an immediate + * snapback mechanism in case the cgroup needs more IO vtime for itself. + * + * Note that adjusting down surplus weights has the same effects as + * accelerating vtime for other cgroups and work conservation can also be + * implemented by adjusting vrate dynamically. However, squaring who can + * donate and should take back how much requires hweight propagations + * anyway making it easier to implement and understand as a separate + * mechanism. + * + * 3. Monitoring + * + * Instead of debugfs or other clumsy monitoring mechanisms, this + * controller uses a drgn based monitoring script - + * tools/cgroup/iocost_monitor.py. For details on drgn, please see + * https://github.com/osandov/drgn. The ouput looks like the following. + * + * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% + * active weight hweight% inflt% dbt delay usages% + * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 + * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 + * + * - per : Timer period + * - cur_per : Internal wall and device vtime clock + * - vrate : Device virtual time rate against wall clock + * - weight : Surplus-adjusted and configured weights + * - hweight : Surplus-adjusted and configured hierarchical weights + * - inflt : The percentage of in-flight IO cost at the end of last period + * - del_ms : Deferred issuer delay induction level and duration + * - usages : Usage history + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/time64.h> +#include <linux/parser.h> +#include <linux/sched/signal.h> +#include <linux/blk-cgroup.h> +#include "blk-rq-qos.h" +#include "blk-stat.h" +#include "blk-wbt.h" + +#ifdef CONFIG_TRACEPOINTS + +/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ +#define TRACE_IOCG_PATH_LEN 1024 +static DEFINE_SPINLOCK(trace_iocg_path_lock); +static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; + +#define TRACE_IOCG_PATH(type, iocg, ...) \ + do { \ + unsigned long flags; \ + if (trace_iocost_##type##_enabled()) { \ + spin_lock_irqsave(&trace_iocg_path_lock, flags); \ + cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ + trace_iocg_path, TRACE_IOCG_PATH_LEN); \ + trace_iocost_##type(iocg, trace_iocg_path, \ + ##__VA_ARGS__); \ + spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ + } \ + } while (0) + +#else /* CONFIG_TRACE_POINTS */ +#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) +#endif /* CONFIG_TRACE_POINTS */ + +enum { + MILLION = 1000000, + + /* timer period is calculated from latency requirements, bound it */ + MIN_PERIOD = USEC_PER_MSEC, + MAX_PERIOD = USEC_PER_SEC, + + /* + * A cgroup's vtime can run 50% behind the device vtime, which + * serves as its IO credit buffer. Surplus weight adjustment is + * immediately canceled if the vtime margin runs below 10%. + */ + MARGIN_PCT = 50, + INUSE_MARGIN_PCT = 10, + + /* Have some play in waitq timer operations */ + WAITQ_TIMER_MARGIN_PCT = 5, + + /* + * vtime can wrap well within a reasonable uptime when vrate is + * consistently raised. Don't trust recorded cgroup vtime if the + * period counter indicates that it's older than 5mins. + */ + VTIME_VALID_DUR = 300 * USEC_PER_SEC, + + /* + * Remember the past three non-zero usages and use the max for + * surplus calculation. Three slots guarantee that we remember one + * full period usage from the last active stretch even after + * partial deactivation and re-activation periods. Don't start + * giving away weight before collecting two data points to prevent + * hweight adjustments based on one partial activation period. + */ + NR_USAGE_SLOTS = 3, + MIN_VALID_USAGES = 2, + + /* 1/64k is granular enough and can easily be handled w/ u32 */ + HWEIGHT_WHOLE = 1 << 16, + + /* + * As vtime is used to calculate the cost of each IO, it needs to + * be fairly high precision. For example, it should be able to + * represent the cost of a single page worth of discard with + * suffificient accuracy. At the same time, it should be able to + * represent reasonably long enough durations to be useful and + * convenient during operation. + * + * 1s worth of vtime is 2^37. This gives us both sub-nanosecond + * granularity and days of wrap-around time even at extreme vrates. + */ + VTIME_PER_SEC_SHIFT = 37, + VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, + VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, + + /* bound vrate adjustments within two orders of magnitude */ + VRATE_MIN_PPM = 10000, /* 1% */ + VRATE_MAX_PPM = 100000000, /* 10000% */ + + VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, + VRATE_CLAMP_ADJ_PCT = 4, + + /* if IOs end up waiting for requests, issue less */ + RQ_WAIT_BUSY_PCT = 5, + + /* unbusy hysterisis */ + UNBUSY_THR_PCT = 75, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, + + /* + * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, + * donate the surplus. + */ + SURPLUS_SCALE_PCT = 125, /* * 125% */ + SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ + SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + + /* switch iff the conditions are met for longer than this */ + AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, + + /* + * Count IO size in 4k pages. The 12bit shift helps keeping + * size-proportional components of cost calculation in closer + * numbers of digits to per-IO cost components. + */ + IOC_PAGE_SHIFT = 12, + IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, + IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, + + /* if apart further than 16M, consider randio for linear model */ + LCOEF_RANDIO_PAGES = 4096, +}; + +enum ioc_running { + IOC_IDLE, + IOC_RUNNING, + IOC_STOP, +}; + +/* io.cost.qos controls including per-dev enable of the whole controller */ +enum { + QOS_ENABLE, + QOS_CTRL, + NR_QOS_CTRL_PARAMS, +}; + +/* io.cost.qos params */ +enum { + QOS_RPPM, + QOS_RLAT, + QOS_WPPM, + QOS_WLAT, + QOS_MIN, + QOS_MAX, + NR_QOS_PARAMS, +}; + +/* io.cost.model controls */ +enum { + COST_CTRL, + COST_MODEL, + NR_COST_CTRL_PARAMS, +}; + +/* builtin linear cost model coefficients */ +enum { + I_LCOEF_RBPS, + I_LCOEF_RSEQIOPS, + I_LCOEF_RRANDIOPS, + I_LCOEF_WBPS, + I_LCOEF_WSEQIOPS, + I_LCOEF_WRANDIOPS, + NR_I_LCOEFS, +}; + +enum { + LCOEF_RPAGE, + LCOEF_RSEQIO, + LCOEF_RRANDIO, + LCOEF_WPAGE, + LCOEF_WSEQIO, + LCOEF_WRANDIO, + NR_LCOEFS, +}; + +enum { + AUTOP_INVALID, + AUTOP_HDD, + AUTOP_SSD_QD1, + AUTOP_SSD_DFL, + AUTOP_SSD_FAST, +}; + +struct ioc_gq; + +struct ioc_params { + u32 qos[NR_QOS_PARAMS]; + u64 i_lcoefs[NR_I_LCOEFS]; + u64 lcoefs[NR_LCOEFS]; + u32 too_fast_vrate_pct; + u32 too_slow_vrate_pct; +}; + +struct ioc_missed { + u32 nr_met; + u32 nr_missed; + u32 last_met; + u32 last_missed; +}; + +struct ioc_pcpu_stat { + struct ioc_missed missed[2]; + + u64 rq_wait_ns; + u64 last_rq_wait_ns; +}; + +/* per device */ +struct ioc { + struct rq_qos rqos; + + bool enabled; + + struct ioc_params params; + u32 period_us; + u32 margin_us; + u64 vrate_min; + u64 vrate_max; + + spinlock_t lock; + struct timer_list timer; + struct list_head active_iocgs; /* active cgroups */ + struct ioc_pcpu_stat __percpu *pcpu_stat; + + enum ioc_running running; + atomic64_t vtime_rate; + + seqcount_t period_seqcount; + u32 period_at; /* wallclock starttime */ + u64 period_at_vtime; /* vtime starttime */ + + atomic64_t cur_period; /* inc'd each period */ + int busy_level; /* saturation history */ + + u64 inuse_margin_vtime; + bool weights_updated; + atomic_t hweight_gen; /* for lazy hweights */ + + u64 autop_too_fast_at; + u64 autop_too_slow_at; + int autop_idx; + bool user_qos_params:1; + bool user_cost_model:1; +}; + +/* per device-cgroup pair */ +struct ioc_gq { + struct blkg_policy_data pd; + struct ioc *ioc; + + /* + * A iocg can get its weight from two sources - an explicit + * per-device-cgroup configuration or the default weight of the + * cgroup. `cfg_weight` is the explicit per-device-cgroup + * configuration. `weight` is the effective considering both + * sources. + * + * When an idle cgroup becomes active its `active` goes from 0 to + * `weight`. `inuse` is the surplus adjusted active weight. + * `active` and `inuse` are used to calculate `hweight_active` and + * `hweight_inuse`. + * + * `last_inuse` remembers `inuse` while an iocg is idle to persist + * surplus adjustments. + */ + u32 cfg_weight; + u32 weight; + u32 active; + u32 inuse; + u32 last_inuse; + + sector_t cursor; /* to detect randio */ + + /* + * `vtime` is this iocg's vtime cursor which progresses as IOs are + * issued. If lagging behind device vtime, the delta represents + * the currently available IO budget. If runnning ahead, the + * overage. + * + * `vtime_done` is the same but progressed on completion rather + * than issue. The delta behind `vtime` represents the cost of + * currently in-flight IOs. + * + * `last_vtime` is used to remember `vtime` at the end of the last + * period to calculate utilization. + */ + atomic64_t vtime; + atomic64_t done_vtime; + atomic64_t abs_vdebt; + u64 last_vtime; + + /* + * The period this iocg was last active in. Used for deactivation + * and invalidating `vtime`. + */ + atomic64_t active_period; + struct list_head active_list; + + /* see __propagate_active_weight() and current_hweight() for details */ + u64 child_active_sum; + u64 child_inuse_sum; + int hweight_gen; + u32 hweight_active; + u32 hweight_inuse; + bool has_surplus; + + struct wait_queue_head waitq; + struct hrtimer waitq_timer; + struct hrtimer delay_timer; + + /* usage is recorded as fractions of HWEIGHT_WHOLE */ + int usage_idx; + u32 usages[NR_USAGE_SLOTS]; + + /* this iocg's depth in the hierarchy and ancestors including self */ + int level; + struct ioc_gq *ancestors[]; +}; + +/* per cgroup */ +struct ioc_cgrp { + struct blkcg_policy_data cpd; + unsigned int dfl_weight; +}; + +struct ioc_now { + u64 now_ns; + u32 now; + u64 vnow; + u64 vrate; +}; + +struct iocg_wait { + struct wait_queue_entry wait; + struct bio *bio; + u64 abs_cost; + bool committed; +}; + +struct iocg_wake_ctx { + struct ioc_gq *iocg; + u32 hw_inuse; + s64 vbudget; +}; + +static const struct ioc_params autop[] = { + [AUTOP_HDD] = { + .qos = { + [QOS_RLAT] = 250000, /* 250ms */ + [QOS_WLAT] = 250000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 174019176, + [I_LCOEF_RSEQIOPS] = 41708, + [I_LCOEF_RRANDIOPS] = 370, + [I_LCOEF_WBPS] = 178075866, + [I_LCOEF_WSEQIOPS] = 42705, + [I_LCOEF_WRANDIOPS] = 378, + }, + }, + [AUTOP_SSD_QD1] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 245855193, + [I_LCOEF_RSEQIOPS] = 61575, + [I_LCOEF_RRANDIOPS] = 6946, + [I_LCOEF_WBPS] = 141365009, + [I_LCOEF_WSEQIOPS] = 33716, + [I_LCOEF_WRANDIOPS] = 26796, + }, + }, + [AUTOP_SSD_DFL] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 488636629, + [I_LCOEF_RSEQIOPS] = 8932, + [I_LCOEF_RRANDIOPS] = 8518, + [I_LCOEF_WBPS] = 427891549, + [I_LCOEF_WSEQIOPS] = 28755, + [I_LCOEF_WRANDIOPS] = 21940, + }, + .too_fast_vrate_pct = 500, + }, + [AUTOP_SSD_FAST] = { + .qos = { + [QOS_RLAT] = 5000, /* 5ms */ + [QOS_WLAT] = 5000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 3102524156LLU, + [I_LCOEF_RSEQIOPS] = 724816, + [I_LCOEF_RRANDIOPS] = 778122, + [I_LCOEF_WBPS] = 1742780862LLU, + [I_LCOEF_WSEQIOPS] = 425702, + [I_LCOEF_WRANDIOPS] = 443193, + }, + .too_slow_vrate_pct = 10, + }, +}; + +/* + * vrate adjust percentages indexed by ioc->busy_level. We adjust up on + * vtime credit shortage and down on device saturation. + */ +static u32 vrate_adj_pct[] = + { 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; + +static struct blkcg_policy blkcg_policy_iocost; + +/* accessors and helpers */ +static struct ioc *rqos_to_ioc(struct rq_qos *rqos) +{ + return container_of(rqos, struct ioc, rqos); +} + +static struct ioc *q_to_ioc(struct request_queue *q) +{ + return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); +} + +static const char *q_name(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return kobject_name(q->kobj.parent); + else + return "<unknown>"; +} + +static const char __maybe_unused *ioc_name(struct ioc *ioc) +{ + return q_name(ioc->rqos.q); +} + +static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct ioc_gq, pd) : NULL; +} + +static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) +{ + return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); +} + +static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) +{ + return pd_to_blkg(&iocg->pd); +} + +static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), + struct ioc_cgrp, cpd); +} + +/* + * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical + * weight, the more expensive each IO. Must round up. + */ +static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); +} + +/* + * The inverse of abs_cost_to_cost(). Must round up. + */ +static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); +} + +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +{ + bio->bi_iocost_cost = cost; + atomic64_add(cost, &iocg->vtime); +} + +#define CREATE_TRACE_POINTS +#include <trace/events/iocost.h> + +/* latency Qos params changed, update period_us and all the dependent params */ +static void ioc_refresh_period_us(struct ioc *ioc) +{ + u32 ppm, lat, multi, period_us; + + lockdep_assert_held(&ioc->lock); + + /* pick the higher latency target */ + if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { + ppm = ioc->params.qos[QOS_RPPM]; + lat = ioc->params.qos[QOS_RLAT]; + } else { + ppm = ioc->params.qos[QOS_WPPM]; + lat = ioc->params.qos[QOS_WLAT]; + } + + /* + * We want the period to be long enough to contain a healthy number + * of IOs while short enough for granular control. Define it as a + * multiple of the latency target. Ideally, the multiplier should + * be scaled according to the percentile so that it would nominally + * contain a certain number of requests. Let's be simpler and + * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). + */ + if (ppm) + multi = max_t(u32, (MILLION - ppm) / 50000, 2); + else + multi = 2; + period_us = multi * lat; + period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); + + /* calculate dependent params */ + ioc->period_us = period_us; + ioc->margin_us = period_us * MARGIN_PCT / 100; + ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( + period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); +} + +static int ioc_autop_idx(struct ioc *ioc) +{ + int idx = ioc->autop_idx; + const struct ioc_params *p = &autop[idx]; + u32 vrate_pct; + u64 now_ns; + + /* rotational? */ + if (!blk_queue_nonrot(ioc->rqos.q)) + return AUTOP_HDD; + + /* handle SATA SSDs w/ broken NCQ */ + if (blk_queue_depth(ioc->rqos.q) == 1) + return AUTOP_SSD_QD1; + + /* use one of the normal ssd sets */ + if (idx < AUTOP_SSD_DFL) + return AUTOP_SSD_DFL; + + /* if user is overriding anything, maintain what was there */ + if (ioc->user_qos_params || ioc->user_cost_model) + return idx; + + /* step up/down based on the vrate */ + vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, + VTIME_PER_USEC); + now_ns = ktime_get_ns(); + + if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { + if (!ioc->autop_too_fast_at) + ioc->autop_too_fast_at = now_ns; + if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) + return idx + 1; + } else { + ioc->autop_too_fast_at = 0; + } + + if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { + if (!ioc->autop_too_slow_at) + ioc->autop_too_slow_at = now_ns; + if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) + return idx - 1; + } else { + ioc->autop_too_slow_at = 0; + } + + return idx; +} + +/* + * Take the followings as input + * + * @bps maximum sequential throughput + * @seqiops maximum sequential 4k iops + * @randiops maximum random 4k iops + * + * and calculate the linear model cost coefficients. + * + * *@page per-page cost 1s / (@bps / 4096) + * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) + * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) + */ +static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, + u64 *page, u64 *seqio, u64 *randio) +{ + u64 v; + + *page = *seqio = *randio = 0; + + if (bps) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, + DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); + + if (seqiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); + if (v > *page) + *seqio = v - *page; + } + + if (randiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); + if (v > *page) + *randio = v - *page; + } +} + +static void ioc_refresh_lcoefs(struct ioc *ioc) +{ + u64 *u = ioc->params.i_lcoefs; + u64 *c = ioc->params.lcoefs; + + calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); + calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], + &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); +} + +static bool ioc_refresh_params(struct ioc *ioc, bool force) +{ + const struct ioc_params *p; + int idx; + + lockdep_assert_held(&ioc->lock); + + idx = ioc_autop_idx(ioc); + p = &autop[idx]; + + if (idx == ioc->autop_idx && !force) + return false; + + if (idx != ioc->autop_idx) + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + + ioc->autop_idx = idx; + ioc->autop_too_fast_at = 0; + ioc->autop_too_slow_at = 0; + + if (!ioc->user_qos_params) + memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); + if (!ioc->user_cost_model) + memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); + + ioc_refresh_period_us(ioc); + ioc_refresh_lcoefs(ioc); + + ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * + VTIME_PER_USEC, MILLION); + ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * + VTIME_PER_USEC, MILLION); + + return true; +} + +/* take a snapshot of the current [v]time and vrate */ +static void ioc_now(struct ioc *ioc, struct ioc_now *now) +{ + unsigned seq; + + now->now_ns = ktime_get(); + now->now = ktime_to_us(now->now_ns); + now->vrate = atomic64_read(&ioc->vtime_rate); + + /* + * The current vtime is + * + * vtime at period start + (wallclock time since the start) * vrate + * + * As a consistent snapshot of `period_at_vtime` and `period_at` is + * needed, they're seqcount protected. + */ + do { + seq = read_seqcount_begin(&ioc->period_seqcount); + now->vnow = ioc->period_at_vtime + + (now->now - ioc->period_at) * now->vrate; + } while (read_seqcount_retry(&ioc->period_seqcount, seq)); +} + +static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) +{ + lockdep_assert_held(&ioc->lock); + WARN_ON_ONCE(ioc->running != IOC_RUNNING); + + write_seqcount_begin(&ioc->period_seqcount); + ioc->period_at = now->now; + ioc->period_at_vtime = now->vnow; + write_seqcount_end(&ioc->period_seqcount); + + ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); + add_timer(&ioc->timer); +} + +/* + * Update @iocg's `active` and `inuse` to @active and @inuse, update level + * weight sums and propagate upwards accordingly. + */ +static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + + lockdep_assert_held(&ioc->lock); + + inuse = min(active, inuse); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u32 parent_active = 0, parent_inuse = 0; + + /* update the level sums */ + parent->child_active_sum += (s32)(active - child->active); + parent->child_inuse_sum += (s32)(inuse - child->inuse); + /* apply the udpates */ + child->active = active; + child->inuse = inuse; + + /* + * The delta between inuse and active sums indicates that + * that much of weight is being given away. Parent's inuse + * and active should reflect the ratio. + */ + if (parent->child_active_sum) { + parent_active = parent->weight; + parent_inuse = DIV64_U64_ROUND_UP( + parent_active * parent->child_inuse_sum, + parent->child_active_sum); + } + + /* do we need to keep walking up? */ + if (parent_active == parent->active && + parent_inuse == parent->inuse) + break; + + active = parent_active; + inuse = parent_inuse; + } + + ioc->weights_updated = true; +} + +static void commit_active_weights(struct ioc *ioc) +{ + lockdep_assert_held(&ioc->lock); + + if (ioc->weights_updated) { + /* paired with rmb in current_hweight(), see there */ + smp_wmb(); + atomic_inc(&ioc->hweight_gen); + ioc->weights_updated = false; + } +} + +static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +{ + __propagate_active_weight(iocg, active, inuse); + commit_active_weights(iocg->ioc); +} + +static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + u32 hwa, hwi; + int ioc_gen; + + /* hot path - if uptodate, use cached */ + ioc_gen = atomic_read(&ioc->hweight_gen); + if (ioc_gen == iocg->hweight_gen) + goto out; + + /* + * Paired with wmb in commit_active_weights(). If we saw the + * updated hweight_gen, all the weight updates from + * __propagate_active_weight() are visible too. + * + * We can race with weight updates during calculation and get it + * wrong. However, hweight_gen would have changed and a future + * reader will recalculate and we're guaranteed to discard the + * wrong result soon. + */ + smp_rmb(); + + hwa = hwi = HWEIGHT_WHOLE; + for (lvl = 0; lvl <= iocg->level - 1; lvl++) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u32 active_sum = READ_ONCE(parent->child_active_sum); + u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u32 active = READ_ONCE(child->active); + u32 inuse = READ_ONCE(child->inuse); + + /* we can race with deactivations and either may read as zero */ + if (!active_sum || !inuse_sum) + continue; + + active_sum = max(active, active_sum); + hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + + inuse_sum = max(inuse, inuse_sum); + hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + } + + iocg->hweight_active = max_t(u32, hwa, 1); + iocg->hweight_inuse = max_t(u32, hwi, 1); + iocg->hweight_gen = ioc_gen; +out: + if (hw_activep) + *hw_activep = iocg->hweight_active; + if (hw_inusep) + *hw_inusep = iocg->hweight_inuse; +} + +static void weight_updated(struct ioc_gq *iocg) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); + u32 weight; + + lockdep_assert_held(&ioc->lock); + + weight = iocg->cfg_weight ?: iocc->dfl_weight; + if (weight != iocg->weight && iocg->active) + propagate_active_weight(iocg, weight, + DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + iocg->weight = weight; +} + +static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 last_period, cur_period, max_period_delta; + u64 vtime, vmargin, vmin; + int i; + + /* + * If seem to be already active, just update the stamp to tell the + * timer that we're still active. We don't mind occassional races. + */ + if (!list_empty(&iocg->active_list)) { + ioc_now(ioc, now); + cur_period = atomic64_read(&ioc->cur_period); + if (atomic64_read(&iocg->active_period) != cur_period) + atomic64_set(&iocg->active_period, cur_period); + return true; + } + + /* racy check on internal node IOs, treat as root level IOs */ + if (iocg->child_active_sum) + return false; + + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, now); + + /* update period */ + cur_period = atomic64_read(&ioc->cur_period); + last_period = atomic64_read(&iocg->active_period); + atomic64_set(&iocg->active_period, cur_period); + + /* already activated or breaking leaf-only constraint? */ + if (!list_empty(&iocg->active_list)) + goto succeed_unlock; + for (i = iocg->level - 1; i > 0; i--) + if (!list_empty(&iocg->ancestors[i]->active_list)) + goto fail_unlock; + + if (iocg->child_active_sum) + goto fail_unlock; + + /* + * vtime may wrap when vrate is raised substantially due to + * underestimated IO costs. Look at the period and ignore its + * vtime if the iocg has been idle for too long. Also, cap the + * budget it can start with to the margin. + */ + max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtime = atomic64_read(&iocg->vtime); + vmargin = ioc->margin_us * now->vrate; + vmin = now->vnow - vmargin; + + if (last_period + max_period_delta < cur_period || + time_before64(vtime, vmin)) { + atomic64_add(vmin - vtime, &iocg->vtime); + atomic64_add(vmin - vtime, &iocg->done_vtime); + vtime = vmin; + } + + /* + * Activate, propagate weight and start period timer if not + * running. Reset hweight_gen to avoid accidental match from + * wrapping. + */ + iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; + list_add(&iocg->active_list, &ioc->active_iocgs); + propagate_active_weight(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight); + + TRACE_IOCG_PATH(iocg_activate, iocg, now, + last_period, cur_period, vtime); + + iocg->last_vtime = vtime; + + if (ioc->running == IOC_IDLE) { + ioc->running = IOC_RUNNING; + ioc_start_period(ioc, now); + } + +succeed_unlock: + spin_unlock_irq(&ioc->lock); + return true; + +fail_unlock: + spin_unlock_irq(&ioc->lock); + return false; +} + +static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, + int flags, void *key) +{ + struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); + struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); + + ctx->vbudget -= cost; + + if (ctx->vbudget < 0) + return -1; + + iocg_commit_bio(ctx->iocg, wait->bio, cost); + + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always + * removed. Remove explicitly and use default_wake_function(). + */ + list_del_init(&wq_entry->entry); + wait->committed = true; + + default_wake_function(wq_entry, mode, flags, key); + return 0; +} + +static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct iocg_wake_ctx ctx = { .iocg = iocg }; + u64 margin_ns = (u64)(ioc->period_us * + WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; + u64 abs_vdebt, vdebt, vshortage, expires, oexpires; + s64 vbudget; + u32 hw_inuse; + + lockdep_assert_held(&iocg->waitq.lock); + + current_hweight(iocg, NULL, &hw_inuse); + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ + abs_vdebt = atomic64_read(&iocg->abs_vdebt); + vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); + if (vdebt && vbudget > 0) { + u64 delta = min_t(u64, vbudget, vdebt); + u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), + abs_vdebt); + + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); + atomic64_sub(abs_delta, &iocg->abs_vdebt); + if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) + atomic64_set(&iocg->abs_vdebt, 0); + } + + /* + * Wake up the ones which are due and see how much vtime we'll need + * for the next one. + */ + ctx.hw_inuse = hw_inuse; + ctx.vbudget = vbudget - vdebt; + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); + if (!waitqueue_active(&iocg->waitq)) + return; + if (WARN_ON_ONCE(ctx.vbudget >= 0)) + return; + + /* determine next wakeup, add a quarter margin to guarantee chunking */ + vshortage = -ctx.vbudget; + expires = now->now_ns + + DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; + expires += margin_ns / 4; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); + if (hrtimer_is_queued(&iocg->waitq_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return; + + hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); +} + +static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + struct ioc_now now; + unsigned long flags; + + ioc_now(iocg->ioc, &now); + + spin_lock_irqsave(&iocg->waitq.lock, flags); + iocg_kick_waitq(iocg, &now); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; +} + +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 vtime = atomic64_read(&iocg->vtime); + u64 vmargin = ioc->margin_us * now->vrate; + u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; + u64 expires, oexpires; + u32 hw_inuse; + + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); + vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); + + /* clear or maintain depending on the overage */ + if (time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return false; + } + if (!atomic_read(&blkg->use_delay) && + time_before_eq64(vtime, now->vnow + vmargin)) + return false; + + /* use delay */ + if (cost) { + u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, + now->vrate); + blkcg_add_delay(blkg, now->now_ns, cost_ns); + } + blkcg_use_delay(blkg); + + expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); + if (hrtimer_is_queued(&iocg->delay_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return true; + + hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); + return true; +} + +static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; + + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now, 0); + + return HRTIMER_NORESTART; +} + +static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) +{ + u32 nr_met[2] = { }; + u32 nr_missed[2] = { }; + u64 rq_wait_ns = 0; + int cpu, rw; + + for_each_online_cpu(cpu) { + struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); + u64 this_rq_wait_ns; + + for (rw = READ; rw <= WRITE; rw++) { + u32 this_met = READ_ONCE(stat->missed[rw].nr_met); + u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + + nr_met[rw] += this_met - stat->missed[rw].last_met; + nr_missed[rw] += this_missed - stat->missed[rw].last_missed; + stat->missed[rw].last_met = this_met; + stat->missed[rw].last_missed = this_missed; + } + + this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; + stat->last_rq_wait_ns = this_rq_wait_ns; + } + + for (rw = READ; rw <= WRITE; rw++) { + if (nr_met[rw] + nr_missed[rw]) + missed_ppm_ar[rw] = + DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, + nr_met[rw] + nr_missed[rw]); + else + missed_ppm_ar[rw] = 0; + } + + *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, + ioc->period_us * NSEC_PER_USEC); +} + +/* was iocg idle this period? */ +static bool iocg_is_idle(struct ioc_gq *iocg) +{ + struct ioc *ioc = iocg->ioc; + + /* did something get issued this period? */ + if (atomic64_read(&iocg->active_period) == + atomic64_read(&ioc->cur_period)) + return false; + + /* is something in flight? */ + if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime)) + return false; + + return true; +} + +/* returns usage with margin added if surplus is large enough */ +static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +{ + /* add margin */ + usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); + usage += SURPLUS_SCALE_ABS; + + /* don't bother if the surplus is too small */ + if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) + return 0; + + return usage; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level, i; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + /* + * Waiters determine the sleep durations based on the vrate they + * saw at the time of sleep. If vrate has increased, some waiters + * could be sleeping for too long. Wake up tardy waiters which + * should have woken up in the last period and expire idle iocgs. + */ + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { + if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) + continue; + + spin_lock(&iocg->waitq.lock); + + if (waitqueue_active(&iocg->waitq) || + atomic64_read(&iocg->abs_vdebt)) { + /* might be oversleeping vtime / hweight changes, kick */ + iocg_kick_waitq(iocg, &now); + iocg_kick_delay(iocg, &now, 0); + } else if (iocg_is_idle(iocg)) { + /* no waiter and idle, deactivate */ + iocg->last_inuse = iocg->inuse; + __propagate_active_weight(iocg, 0, 0); + list_del_init(&iocg->active_list); + } + + spin_unlock(&iocg->waitq.lock); + } + commit_active_weights(ioc); + + /* calc usages and see whether some weights need to be moved around */ + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 vdone, vtime, vusage, vmargin, vmin; + u32 hw_active, hw_inuse, usage; + + /* + * Collect unused and wind vtime closer to vnow to prevent + * iocgs from accumulating a large amount of budget. + */ + vdone = atomic64_read(&iocg->done_vtime); + vtime = atomic64_read(&iocg->vtime); + current_hweight(iocg, &hw_active, &hw_inuse); + + /* + * Latency QoS detection doesn't account for IOs which are + * in-flight for longer than a period. Detect them by + * comparing vdone against period start. If lagging behind + * IOs from past periods, don't increase vrate. + */ + if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && + !atomic_read(&iocg_to_blkg(iocg)->use_delay) && + time_after64(vtime, vdone) && + time_after64(vtime, now.vnow - + MAX_LAGGING_PERIODS * period_vtime) && + time_before64(vdone, now.vnow - period_vtime)) + nr_lagging++; + + if (waitqueue_active(&iocg->waitq)) + vusage = now.vnow - iocg->last_vtime; + else if (time_before64(iocg->last_vtime, vtime)) + vusage = vtime - iocg->last_vtime; + else + vusage = 0; + + iocg->last_vtime += vusage; + /* + * Factor in in-flight vtime into vusage to avoid + * high-latency completions appearing as idle. This should + * be done after the above ->last_time adjustment. + */ + vusage = max(vusage, vtime - vdone); + + /* calculate hweight based usage ratio and record */ + if (vusage) { + usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, + period_vtime); + iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; + iocg->usages[iocg->usage_idx] = usage; + } else { + usage = 0; + } + + /* see whether there's surplus vtime */ + vmargin = ioc->margin_us * now.vrate; + vmin = now.vnow - vmargin; + + iocg->has_surplus = false; + + if (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, vmin)) { + u64 delta = vmin - vtime; + + /* throw away surplus vtime */ + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); + iocg->last_vtime += delta; + /* if usage is sufficiently low, maybe it can donate */ + if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { + iocg->has_surplus = true; + nr_surpluses++; + } + } else if (hw_inuse < hw_active) { + u32 new_hwi, new_inuse; + + /* was donating but might need to take back some */ + if (waitqueue_active(&iocg->waitq)) { + new_hwi = hw_active; + } else { + new_hwi = max(hw_inuse, + usage * SURPLUS_SCALE_PCT / 100 + + SURPLUS_SCALE_ABS); + } + + new_inuse = div64_u64((u64)iocg->inuse * new_hwi, + hw_inuse); + new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); + + if (new_inuse > iocg->inuse) { + TRACE_IOCG_PATH(inuse_takeback, iocg, &now, + iocg->inuse, new_inuse, + hw_inuse, new_hwi); + __propagate_active_weight(iocg, iocg->weight, + new_inuse); + } + } else { + /* genuninely out of vtime */ + nr_shortages++; + } + } + + if (!nr_shortages || !nr_surpluses) + goto skip_surplus_transfers; + + /* there are both shortages and surpluses, transfer surpluses */ + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; + int nr_valid = 0; + + if (!iocg->has_surplus) + continue; + + /* base the decision on max historical usage */ + for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { + if (iocg->usages[i]) { + usage = max(usage, iocg->usages[i]); + nr_valid++; + } + } + if (nr_valid < MIN_VALID_USAGES) + continue; + + current_hweight(iocg, &hw_active, &hw_inuse); + new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); + if (!new_hwi) + continue; + + new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, + hw_inuse); + if (new_inuse < iocg->inuse) { + TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, + iocg->inuse, new_inuse, + hw_inuse, new_hwi); + __propagate_active_weight(iocg, iocg->weight, new_inuse); + } + } +skip_surplus_transfers: + commit_active_weights(ioc); + + /* + * If q is getting clogged or we're missing too much, we're issuing + * too much IO and should lower vtime rate. If we're not missing + * and experiencing shortages but not surpluses, we're too stingy + * and should increase vtime rate. + */ + prev_busy_level = ioc->busy_level; + if (rq_wait_pct > RQ_WAIT_BUSY_PCT || + missed_ppm[READ] > ppm_rthr || + missed_ppm[WRITE] > ppm_wthr) { + ioc->busy_level = max(ioc->busy_level, 0); + ioc->busy_level++; + } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && + missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && + missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { + /* take action iff there is contention */ + if (nr_shortages && !nr_lagging) { + ioc->busy_level = min(ioc->busy_level, 0); + /* redistribute surpluses first */ + if (!nr_surpluses) + ioc->busy_level--; + } + } else { + ioc->busy_level = 0; + } + + ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); + + if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { + u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), + 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), + 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages, + nr_surpluses); + + atomic64_set(&ioc->vtime_rate, vrate); + ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( + ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + } else if (ioc->busy_level != prev_busy_level || nr_lagging) { + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + &missed_ppm, rq_wait_pct, nr_lagging, + nr_shortages, nr_surpluses); + } + + ioc_refresh_params(ioc, false); + + /* + * This period is done. Move onto the next one. If nothing's + * going on with the device, stop the timer. + */ + atomic64_inc(&ioc->cur_period); + + if (ioc->running != IOC_STOP) { + if (!list_empty(&ioc->active_iocgs)) { + ioc_start_period(ioc, &now); + } else { + ioc->busy_level = 0; + ioc->running = IOC_IDLE; + } + } + + spin_unlock_irq(&ioc->lock); +} + +static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, + bool is_merge, u64 *costp) +{ + struct ioc *ioc = iocg->ioc; + u64 coef_seqio, coef_randio, coef_page; + u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); + u64 seek_pages = 0; + u64 cost = 0; + + switch (bio_op(bio)) { + case REQ_OP_READ: + coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + goto out; + } + + if (iocg->cursor) { + seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); + seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; + } + + if (!is_merge) { + if (seek_pages > LCOEF_RANDIO_PAGES) { + cost += coef_randio; + } else { + cost += coef_seqio; + } + } + cost += pages * coef_page; +out: + *costp = cost; +} + +static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) +{ + u64 cost; + + calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); + return cost; +} + +static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_gq *iocg = blkg_to_iocg(blkg); + struct ioc_now now; + struct iocg_wait wait; + u32 hw_active, hw_inuse; + u64 abs_cost, cost, vtime; + + /* bypass IOs if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) + return; + + /* always activate so that even 0 cost IOs get protected to some level */ + if (!iocg_activate(iocg, &now)) + return; + + /* calculate the absolute vtime cost */ + abs_cost = calc_vtime_cost(bio, iocg, false); + if (!abs_cost) + return; + + iocg->cursor = bio_end_sector(bio); + + vtime = atomic64_read(&iocg->vtime); + current_hweight(iocg, &hw_active, &hw_inuse); + + if (hw_inuse < hw_active && + time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { + TRACE_IOCG_PATH(inuse_reset, iocg, &now, + iocg->inuse, iocg->weight, hw_inuse, hw_active); + spin_lock_irq(&ioc->lock); + propagate_active_weight(iocg, iocg->weight, iocg->weight); + spin_unlock_irq(&ioc->lock); + current_hweight(iocg, &hw_active, &hw_inuse); + } + + cost = abs_cost_to_cost(abs_cost, hw_inuse); + + /* + * If no one's waiting and within budget, issue right away. The + * tests are racy but the races aren't systemic - we only miss once + * in a while which is fine. + */ + if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && + time_before_eq64(vtime + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, cost); + return; + } + + /* + * We're over budget. If @bio has to be issued regardless, + * remember the abs_cost instead of advancing vtime. + * iocg_kick_waitq() will pay off the debt before waking more IOs. + * This way, the debt is continuously paid off each period with the + * actual budget available to the cgroup. If we just wound vtime, + * we would incorrectly use the current hw_inuse for the entire + * amount which, for example, can lead to the cgroup staying + * blocked for a long time even with substantially raised hw_inuse. + */ + if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { + atomic64_add(abs_cost, &iocg->abs_vdebt); + if (iocg_kick_delay(iocg, &now, cost)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + return; + } + + /* + * Append self to the waitq and schedule the wakeup timer if we're + * the first waiter. The timer duration is calculated based on the + * current vrate. vtime and hweight changes can make it too short + * or too long. Each wait entry records the absolute cost it's + * waiting for to allow re-evaluation using a custom wait entry. + * + * If too short, the timer simply reschedules itself. If too long, + * the period timer will notice and trigger wakeups. + * + * All waiters are on iocg->waitq and the wait states are + * synchronized using waitq.lock. + */ + spin_lock_irq(&iocg->waitq.lock); + + /* + * We activated above but w/o any synchronization. Deactivation is + * synchronized with waitq.lock and we won't get deactivated as + * long as we're waiting, so we're good if we're activated here. + * In the unlikely case that we are deactivated, just issue the IO. + */ + if (unlikely(list_empty(&iocg->active_list))) { + spin_unlock_irq(&iocg->waitq.lock); + iocg_commit_bio(iocg, bio, cost); + return; + } + + init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); + wait.wait.private = current; + wait.bio = bio; + wait.abs_cost = abs_cost; + wait.committed = false; /* will be set true by waker */ + + __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); + iocg_kick_waitq(iocg, &now); + + spin_unlock_irq(&iocg->waitq.lock); + + while (true) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (wait.committed) + break; + io_schedule(); + } + + /* waker already committed us, proceed */ + finish_wait(&iocg->waitq, &wait.wait); +} + +static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + struct ioc *ioc = iocg->ioc; + sector_t bio_end = bio_end_sector(bio); + struct ioc_now now; + u32 hw_inuse; + u64 abs_cost, cost; + + /* bypass if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) + return; + + abs_cost = calc_vtime_cost(bio, iocg, true); + if (!abs_cost) + return; + + ioc_now(ioc, &now); + current_hweight(iocg, NULL, &hw_inuse); + cost = abs_cost_to_cost(abs_cost, hw_inuse); + + /* update cursor if backmerging into the request at the cursor */ + if (blk_rq_pos(rq) < bio_end && + blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) + iocg->cursor = bio_end; + + /* + * Charge if there's enough vtime budget and the existing request + * has cost assigned. Otherwise, account it as debt. See debt + * handling in ioc_rqos_throttle() for details. + */ + if (rq->bio && rq->bio->bi_iocost_cost && + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) + iocg_commit_bio(iocg, bio, cost); + else + atomic64_add(abs_cost, &iocg->abs_vdebt); +} + +static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + + if (iocg && bio->bi_iocost_cost) + atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); +} + +static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + u64 on_q_ns, rq_wait_ns; + int pidx, rw; + + if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) + return; + + switch (req_op(rq) & REQ_OP_MASK) { + case REQ_OP_READ: + pidx = QOS_RLAT; + rw = READ; + break; + case REQ_OP_WRITE: + pidx = QOS_WLAT; + rw = WRITE; + break; + default: + return; + } + + on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; + + if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC) + this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + else + this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + + this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); +} + +static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + spin_lock_irq(&ioc->lock); + ioc_refresh_params(ioc, false); + spin_unlock_irq(&ioc->lock); +} + +static void ioc_rqos_exit(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + + spin_lock_irq(&ioc->lock); + ioc->running = IOC_STOP; + spin_unlock_irq(&ioc->lock); + + del_timer_sync(&ioc->timer); + free_percpu(ioc->pcpu_stat); + kfree(ioc); +} + +static struct rq_qos_ops ioc_rqos_ops = { + .throttle = ioc_rqos_throttle, + .merge = ioc_rqos_merge, + .done_bio = ioc_rqos_done_bio, + .done = ioc_rqos_done, + .queue_depth_changed = ioc_rqos_queue_depth_changed, + .exit = ioc_rqos_exit, +}; + +static int blk_iocost_init(struct request_queue *q) +{ + struct ioc *ioc; + struct rq_qos *rqos; + int ret; + + ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) + return -ENOMEM; + + ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); + if (!ioc->pcpu_stat) { + kfree(ioc); + return -ENOMEM; + } + + rqos = &ioc->rqos; + rqos->id = RQ_QOS_COST; + rqos->ops = &ioc_rqos_ops; + rqos->q = q; + + spin_lock_init(&ioc->lock); + timer_setup(&ioc->timer, ioc_timer_fn, 0); + INIT_LIST_HEAD(&ioc->active_iocgs); + + ioc->running = IOC_IDLE; + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + seqcount_init(&ioc->period_seqcount); + ioc->period_at = ktime_to_us(ktime_get()); + atomic64_set(&ioc->cur_period, 0); + atomic_set(&ioc->hweight_gen, 0); + + spin_lock_irq(&ioc->lock); + ioc->autop_idx = AUTOP_INVALID; + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + rq_qos_add(q, rqos); + ret = blkcg_activate_policy(q, &blkcg_policy_iocost); + if (ret) { + rq_qos_del(q, rqos); + free_percpu(ioc->pcpu_stat); + kfree(ioc); + return ret; + } + return 0; +} + +static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) +{ + struct ioc_cgrp *iocc; + + iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); + if (!iocc) + return NULL; + + iocc->dfl_weight = CGROUP_WEIGHT_DFL; + return &iocc->cpd; +} + +static void ioc_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(container_of(cpd, struct ioc_cgrp, cpd)); +} + +static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + int levels = blkcg->css.cgroup->level + 1; + struct ioc_gq *iocg; + + iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), + gfp, q->node); + if (!iocg) + return NULL; + + return &iocg->pd; +} + +static void ioc_pd_init(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); + struct ioc *ioc = q_to_ioc(blkg->q); + struct ioc_now now; + struct blkcg_gq *tblkg; + unsigned long flags; + + ioc_now(ioc, &now); + + iocg->ioc = ioc; + atomic64_set(&iocg->vtime, now.vnow); + atomic64_set(&iocg->done_vtime, now.vnow); + atomic64_set(&iocg->abs_vdebt, 0); + atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); + INIT_LIST_HEAD(&iocg->active_list); + iocg->hweight_active = HWEIGHT_WHOLE; + iocg->hweight_inuse = HWEIGHT_WHOLE; + + init_waitqueue_head(&iocg->waitq); + hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + iocg->waitq_timer.function = iocg_waitq_timer_fn; + hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + iocg->delay_timer.function = iocg_delay_timer_fn; + + iocg->level = blkg->blkcg->css.cgroup->level; + + for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { + struct ioc_gq *tiocg = blkg_to_iocg(tblkg); + iocg->ancestors[tiocg->level] = tiocg; + } + + spin_lock_irqsave(&ioc->lock, flags); + weight_updated(iocg); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +static void ioc_pd_free(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + + if (ioc) { + spin_lock(&ioc->lock); + if (!list_empty(&iocg->active_list)) { + propagate_active_weight(iocg, 0, 0); + list_del_init(&iocg->active_list); + } + spin_unlock(&ioc->lock); + + hrtimer_cancel(&iocg->waitq_timer); + hrtimer_cancel(&iocg->delay_timer); + } + kfree(iocg); +} + +static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc_gq *iocg = pd_to_iocg(pd); + + if (dname && iocg->cfg_weight) + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + return 0; +} + + +static int ioc_weight_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + + seq_printf(sf, "default %u\n", iocc->dfl_weight); + blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + struct blkg_conf_ctx ctx; + struct ioc_gq *iocg; + u32 v; + int ret; + + if (!strchr(buf, ':')) { + struct blkcg_gq *blkg; + + if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) + return -EINVAL; + + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkcg->lock); + iocc->dfl_weight = v; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct ioc_gq *iocg = blkg_to_iocg(blkg); + + if (iocg) { + spin_lock_irq(&iocg->ioc->lock); + weight_updated(iocg); + spin_unlock_irq(&iocg->ioc->lock); + } + } + spin_unlock(&blkcg->lock); + + return nbytes; + } + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + if (ret) + return ret; + + iocg = blkg_to_iocg(ctx.blkg); + + if (!strncmp(ctx.body, "default", 7)) { + v = 0; + } else { + if (!sscanf(ctx.body, "%u", &v)) + goto einval; + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + goto einval; + } + + spin_lock(&iocg->ioc->lock); + iocg->cfg_weight = v; + weight_updated(iocg); + spin_unlock(&iocg->ioc->lock); + + blkg_conf_finish(&ctx); + return nbytes; + +einval: + blkg_conf_finish(&ctx); + return -EINVAL; +} + +static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + + if (!dname) + return 0; + + seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", + dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", + ioc->params.qos[QOS_RPPM] / 10000, + ioc->params.qos[QOS_RPPM] % 10000 / 100, + ioc->params.qos[QOS_RLAT], + ioc->params.qos[QOS_WPPM] / 10000, + ioc->params.qos[QOS_WPPM] % 10000 / 100, + ioc->params.qos[QOS_WLAT], + ioc->params.qos[QOS_MIN] / 10000, + ioc->params.qos[QOS_MIN] % 10000 / 100, + ioc->params.qos[QOS_MAX] / 10000, + ioc->params.qos[QOS_MAX] % 10000 / 100); + return 0; +} + +static int ioc_qos_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t qos_ctrl_tokens = { + { QOS_ENABLE, "enable=%u" }, + { QOS_CTRL, "ctrl=%s" }, + { NR_QOS_CTRL_PARAMS, NULL }, +}; + +static const match_table_t qos_tokens = { + { QOS_RPPM, "rpct=%s" }, + { QOS_RLAT, "rlat=%u" }, + { QOS_WPPM, "wpct=%s" }, + { QOS_WLAT, "wlat=%u" }, + { QOS_MIN, "min=%s" }, + { QOS_MAX, "max=%s" }, + { NR_QOS_PARAMS, NULL }, +}; + +static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u32 qos[NR_QOS_PARAMS]; + bool enable, user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(qos, ioc->params.qos, sizeof(qos)); + enable = ioc->enabled; + user = ioc->user_qos_params; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + s64 v; + + if (!*p) + continue; + + switch (match_token(p, qos_ctrl_tokens, args)) { + case QOS_ENABLE: + match_u64(&args[0], &v); + enable = v; + continue; + case QOS_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + } + + tok = match_token(p, qos_tokens, args); + switch (tok) { + case QOS_RPPM: + case QOS_WPPM: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0 || v > 10000) + goto einval; + qos[tok] = v * 100; + break; + case QOS_RLAT: + case QOS_WLAT: + if (match_u64(&args[0], &v)) + goto einval; + qos[tok] = v; + break; + case QOS_MIN: + case QOS_MAX: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0) + goto einval; + qos[tok] = clamp_t(s64, v * 100, + VRATE_MIN_PPM, VRATE_MAX_PPM); + break; + default: + goto einval; + } + user = true; + } + + if (qos[QOS_MIN] > qos[QOS_MAX]) + goto einval; + + spin_lock_irq(&ioc->lock); + + if (enable) { + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = true; + } else { + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = false; + } + + if (user) { + memcpy(ioc->params.qos, qos, sizeof(qos)); + ioc->user_qos_params = true; + } else { + ioc->user_qos_params = false; + } + + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static u64 ioc_cost_model_prfill(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + u64 *u = ioc->params.i_lcoefs; + + if (!dname) + return 0; + + seq_printf(sf, "%s ctrl=%s model=linear " + "rbps=%llu rseqiops=%llu rrandiops=%llu " + "wbps=%llu wseqiops=%llu wrandiops=%llu\n", + dname, ioc->user_cost_model ? "user" : "auto", + u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); + return 0; +} + +static int ioc_cost_model_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t cost_ctrl_tokens = { + { COST_CTRL, "ctrl=%s" }, + { COST_MODEL, "model=%s" }, + { NR_COST_CTRL_PARAMS, NULL }, +}; + +static const match_table_t i_lcoef_tokens = { + { I_LCOEF_RBPS, "rbps=%u" }, + { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, + { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, + { I_LCOEF_WBPS, "wbps=%u" }, + { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, + { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, + { NR_I_LCOEFS, NULL }, +}; + +static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u64 u[NR_I_LCOEFS]; + bool user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(u, ioc->params.i_lcoefs, sizeof(u)); + user = ioc->user_cost_model; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + u64 v; + + if (!*p) + continue; + + switch (match_token(p, cost_ctrl_tokens, args)) { + case COST_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + case COST_MODEL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (strcmp(buf, "linear")) + goto einval; + continue; + } + + tok = match_token(p, i_lcoef_tokens, args); + if (tok == NR_I_LCOEFS) + goto einval; + if (match_u64(&args[0], &v)) + goto einval; + u[tok] = v; + user = true; + } + + spin_lock_irq(&ioc->lock); + if (user) { + memcpy(ioc->params.i_lcoefs, u, sizeof(u)); + ioc->user_cost_model = true; + } else { + ioc->user_cost_model = false; + } + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; + +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static struct cftype ioc_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioc_weight_show, + .write = ioc_weight_write, + }, + { + .name = "cost.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_qos_show, + .write = ioc_qos_write, + }, + { + .name = "cost.model", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_cost_model_show, + .write = ioc_cost_model_write, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iocost = { + .dfl_cftypes = ioc_files, + .cpd_alloc_fn = ioc_cpd_alloc, + .cpd_free_fn = ioc_cpd_free, + .pd_alloc_fn = ioc_pd_alloc, + .pd_init_fn = ioc_pd_init, + .pd_free_fn = ioc_pd_free, +}; + +static int __init ioc_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iocost); +} + +static void __exit ioc_exit(void) +{ + return blkcg_policy_unregister(&blkcg_policy_iocost); +} + +module_init(ioc_init); +module_exit(ioc_exit); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 0fff7b56df0e..c128d50cb410 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -725,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q) return -ENOMEM; rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_CGROUP; + rqos->id = RQ_QOS_LATENCY; rqos->ops = &blkcg_iolatency_ops; rqos->q = q; @@ -934,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, } -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, node); + iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), diff --git a/block/blk-map.c b/block/blk-map.c index 3a62e471d81b..b0790268ed9d 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -151,7 +151,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, return 0; unmap_rq: - __blk_rq_unmap_user(bio); + blk_rq_unmap_user(bio); fail: rq->bio = NULL; return ret; diff --git a/block/blk-merge.c b/block/blk-merge.c index 57f7990b342d..1534ed736363 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -132,51 +132,83 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); } +/* + * Return the maximum number of sectors from the start of a bio that may be + * submitted as a single request to a block device. If enough sectors remain, + * align the end to the physical block size. Otherwise align the end to the + * logical block size. This approach minimizes the number of non-aligned + * requests that are submitted to a block device if the start of a bio is not + * aligned to a physical block boundary. + */ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - unsigned mask = queue_logical_block_size(q) - 1; + unsigned max_sectors = sectors; + unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; + unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; + unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); - /* aligned to logical block size */ - sectors &= ~(mask >> 9); + max_sectors += start_offset; + max_sectors &= ~(pbs - 1); + if (max_sectors > start_offset) + return max_sectors - start_offset; - return sectors; + return sectors & (lbs - 1); } -static unsigned get_max_segment_size(struct request_queue *q, - unsigned offset) +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) { unsigned long mask = queue_segment_boundary(q); - /* default segment boundary mask means no boundary limit */ - if (mask == BLK_SEG_BOUNDARY_MASK) - return queue_max_segment_size(q); + offset = mask & (page_to_phys(start_page) + offset); - return min_t(unsigned long, mask - (mask & offset) + 1, - queue_max_segment_size(q)); + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); } -/* - * Split the bvec @bv into segments, and update all kinds of - * variables. +/** + * bvec_split_segs - verify whether or not a bvec should be split in the middle + * @q: [in] request queue associated with the bio associated with @bv + * @bv: [in] bvec to examine + * @nsegs: [in,out] Number of segments in the bio being built. Incremented + * by the number of segments from @bv that may be appended to that + * bio without exceeding @max_segs + * @sectors: [in,out] Number of sectors in the bio being built. Incremented + * by the number of sectors from @bv that may be appended to that + * bio without exceeding @max_sectors + * @max_segs: [in] upper bound for *@nsegs + * @max_sectors: [in] upper bound for *@sectors + * + * When splitting a bio, it can happen that a bvec is encountered that is too + * big to fit in a single segment and hence that it has to be split in the + * middle. This function verifies whether or not that should happen. The value + * %true is returned if and only if appending the entire @bv to a bio with + * *@nsegs segments and *@sectors sectors would make that bio unacceptable for + * the block driver. */ -static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, - unsigned *nsegs, unsigned *sectors, unsigned max_segs) +static bool bvec_split_segs(const struct request_queue *q, + const struct bio_vec *bv, unsigned *nsegs, + unsigned *sectors, unsigned max_segs, + unsigned max_sectors) { - unsigned len = bv->bv_len; + unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; - unsigned new_nsegs = 0, seg_size = 0; + unsigned seg_size = 0; - /* - * Multi-page bvec may be too big to hold in one segment, so the - * current bvec has to be splitted as multiple segments. - */ - while (len && new_nsegs + *nsegs < max_segs) { - seg_size = get_max_segment_size(q, bv->bv_offset + total_len); + while (len && *nsegs < max_segs) { + seg_size = get_max_segment_size(q, bv->bv_page, + bv->bv_offset + total_len); seg_size = min(seg_size, len); - new_nsegs++; + (*nsegs)++; total_len += seg_size; len -= seg_size; @@ -184,16 +216,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, break; } - if (new_nsegs) { - *nsegs += new_nsegs; - if (sectors) - *sectors += total_len >> 9; - } + *sectors += total_len >> 9; - /* split in the middle of the bvec if len != 0 */ - return !!len; + /* tell the caller to split the bvec if it is too big to fit */ + return len > 0 || bv->bv_len > max_len; } +/** + * blk_bio_segment_split - split a bio in two bios + * @q: [in] request queue pointer + * @bio: [in] bio to be split + * @bs: [in] bio set to allocate the clone from + * @segs: [out] number of segments in the bio with the first half of the sectors + * + * Clone @bio, update the bi_iter of the clone to represent the first sectors + * of @bio and update @bio->bi_iter to represent the remaining sectors. The + * following is guaranteed for the cloned bio: + * - That it has at most get_max_io_size(@q, @bio) sectors. + * - That it has at most queue_max_segments(@q) segments. + * + * Except for discard requests the cloned bio will point at the bi_io_vec of + * the original bio. It is the responsibility of the caller to ensure that the + * original bio is not freed before the cloned bio. The caller is also + * responsible for ensuring that @bs is only destroyed after processing of the + * split bio has finished. + */ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -213,34 +260,18 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; - if (sectors + (bv.bv_len >> 9) > max_sectors) { - /* - * Consider this a new segment if we're splitting in - * the middle of this vector. - */ - if (nsegs < max_segs && - sectors < max_sectors) { - /* split in the middle of bvec */ - bv.bv_len = (max_sectors - sectors) << 9; - bvec_split_segs(q, &bv, &nsegs, - §ors, max_segs); - } + if (nsegs < max_segs && + sectors + (bv.bv_len >> 9) <= max_sectors && + bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + nsegs++; + sectors += bv.bv_len >> 9; + } else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs, + max_sectors)) { goto split; } - if (nsegs == max_segs) - goto split; - bvprv = bv; bvprvp = &bvprv; - - if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { - nsegs++; - sectors += bv.bv_len >> 9; - } else if (bvec_split_segs(q, &bv, &nsegs, §ors, - max_segs)) { - goto split; - } } *segs = nsegs; @@ -250,10 +281,23 @@ split: return bio_split(bio, sectors, GFP_NOIO, bs); } +/** + * __blk_queue_split - split a bio and submit the second half + * @q: [in] request queue pointer + * @bio: [in, out] bio to be split + * @nr_segs: [out] number of segments in the first bio + * + * Split a bio into two bios, chain the two bios, submit the second half and + * store a pointer to the first half in *@bio. If the second bio is still too + * big it will be split by a recursive call to this function. Since this + * function may allocate a new bio from @q->bio_split, it is the responsibility + * of the caller to ensure that @q is only released after processing of the + * split bio has finished. + */ void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs) { - struct bio *split; + struct bio *split = NULL; switch (bio_op(*bio)) { case REQ_OP_DISCARD: @@ -269,6 +313,21 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, nr_segs); break; default: + /* + * All drivers must accept single-segments bios that are <= + * PAGE_SIZE. This is a quick and dirty check that relies on + * the fact that bi_io_vec[0] is always valid if a bio has data. + * The check might lead to occasional false negatives when bios + * are cloned, but compared to the performance impact of cloned + * bios themselves the loop below doesn't matter anyway. + */ + if (!q->limits.chunk_sectors && + (*bio)->bi_vcnt == 1 && + ((*bio)->bi_io_vec[0].bv_len + + (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { + *nr_segs = 1; + break; + } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } @@ -294,6 +353,17 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, } } +/** + * blk_queue_split - split a bio and submit the second half + * @q: [in] request queue pointer + * @bio: [in, out] bio to be split + * + * Split a bio into two bios, chains the two bios, submit the second half and + * store a pointer to the first half in *@bio. Since this function may allocate + * a new bio from @q->bio_split, it is the responsibility of the caller to + * ensure that @q is only released after processing of the split bio has + * finished. + */ void blk_queue_split(struct request_queue *q, struct bio **bio) { unsigned int nr_segs; @@ -305,6 +375,7 @@ EXPORT_SYMBOL(blk_queue_split); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; + unsigned int nr_sectors = 0; struct req_iterator iter; struct bio_vec bv; @@ -321,7 +392,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq) } rq_for_each_bvec(bv, rq, iter) - bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX); + bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + UINT_MAX, UINT_MAX); return nr_phys_segs; } @@ -351,7 +423,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(q, offset), nbytes); + unsigned len = min(get_max_segment_size(q, bvec->bv_page, + offset), nbytes); struct page *page = bvec->bv_page; /* diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index f945621a0e8f..0157f2b3485a 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -15,10 +15,10 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(struct blk_mq_queue_map *qmap, - unsigned int nr_queues, const int cpu) +static int queue_index(struct blk_mq_queue_map *qmap, + unsigned int nr_queues, const int q) { - return qmap->queue_offset + (cpu % nr_queues); + return qmap->queue_offset + (q % nr_queues); } static int get_first_sibling(unsigned int cpu) @@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap) { unsigned int *map = qmap->mq_map; unsigned int nr_queues = qmap->nr_queues; - unsigned int cpu, first_sibling; + unsigned int cpu, first_sibling, q = 0; + + for_each_possible_cpu(cpu) + map[cpu] = -1; + + /* + * Spread queues among present CPUs first for minimizing + * count of dead queues which are mapped by all un-present CPUs + */ + for_each_present_cpu(cpu) { + if (q >= nr_queues) + break; + map[cpu] = queue_index(qmap, nr_queues, q++); + } for_each_possible_cpu(cpu) { + if (map[cpu] != -1) + continue; /* * First do sequential mapping between CPUs and queues. * In case we still have CPUs to map, and we have some number of * threads per cores then map sibling threads to the same queue * for performance optimizations. */ - if (cpu < nr_queues) { - map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); + if (q < nr_queues) { + map[cpu] = queue_index(qmap, nr_queues, q++); } else { first_sibling = get_first_sibling(cpu); if (first_sibling == cpu) - map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); + map[cpu] = queue_index(qmap, nr_queues, q++); else map[cpu] = map[first_sibling]; } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c9d183d6c499..ca22afd47b3d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,8 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index d6e1a9bd7131..062229395a50 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -74,10 +74,8 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(ctx, page); + res = entry->show(ctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -97,10 +95,8 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(ctx, page, length); + res = entry->store(ctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -120,10 +116,8 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(hctx, page); + res = entry->show(hctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -144,10 +138,8 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(hctx, page, length); + res = entry->store(hctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -166,20 +158,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { + const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; - ssize_t ret = 0; + int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) - ret += sprintf(ret + page, "%u", i); + ret = snprintf(pos + page, size - pos, "%u", i); else - ret += sprintf(ret + page, ", %u", i); + ret = snprintf(pos + page, size - pos, ", %u", i); + + if (ret >= size - pos) + break; first = 0; + pos += ret; } - ret += sprintf(ret + page, "\n"); - return ret; + ret = snprintf(pos + page, size + 1 - pos, "\n"); + return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { @@ -270,7 +267,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); @@ -320,7 +317,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) int ret, i; WARN_ON_ONCE(!q->kobj.parent); - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) @@ -349,23 +346,12 @@ unreg: return ret; } -int blk_mq_register_dev(struct device *dev, struct request_queue *q) -{ - int ret; - - mutex_lock(&q->sysfs_lock); - ret = __blk_mq_register_dev(dev, q); - mutex_unlock(&q->sysfs_lock); - - return ret; -} - void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -373,7 +359,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) blk_mq_unregister_hctx(hctx); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register(struct request_queue *q) @@ -381,7 +367,7 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -392,7 +378,7 @@ int blk_mq_sysfs_register(struct request_queue *q) } unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index da19f0bc8876..fbacde454718 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -10,18 +10,11 @@ #include <linux/module.h> #include <linux/blk-mq.h> +#include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) -{ - if (!tags) - return true; - - return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); -} - /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -354,6 +347,37 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +static bool blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +{ + unsigned *count = data; + + if (blk_mq_request_completed(rq)) + (*count)++; + return true; +} + +/** + * blk_mq_tagset_wait_completed_request - wait until all completed req's + * complete funtion is run + * @tagset: Tag set to drain completed request + * + * Note: This function has to be run after all IO queues are shutdown + */ +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + while (true) { + unsigned count = 0; + + blk_mq_tagset_busy_iter(tagset, + blk_mq_tagset_count_completed_rqs, &count); + if (!count) + break; + msleep(5); + } +} +EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); + /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..15bc74acb57e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,6 @@ extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); -extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0835f4d8d42e..a12b1763508d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include <trace/events/block.h> #include <linux/blk-mq.h> +#include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -44,12 +45,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); static int blk_mq_poll_stats_bkt(const struct request *rq) { - int ddir, bytes, bucket; + int ddir, sectors, bucket; ddir = rq_data_dir(rq); - bytes = blk_rq_bytes(rq); + sectors = blk_rq_stats_sectors(rq); - bucket = ddir + 2*(ilog2(bytes) - 9); + bucket = ddir + 2 * ilog2(sectors); if (bucket < 0) return -1; @@ -92,7 +93,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct mq_inflight { struct hd_struct *part; - unsigned int *inflight; + unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, @@ -101,45 +102,29 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - /* - * index[0] counts the specific partition that was asked for. - */ if (rq->part == mi->part) - mi->inflight[0]++; + mi->inflight[rq_data_dir(rq)]++; return true; } unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { - unsigned inflight[2]; - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0]; -} - -static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) -{ - struct mq_inflight *mi = priv; - - if (rq->part == mi->part) - mi->inflight[rq_data_dir(rq)]++; - - return true; + return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) @@ -275,23 +260,17 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -{ - return blk_mq_has_free_tags(hctx->tags); -} -EXPORT_SYMBOL(blk_mq_can_queue); - /* - * Only need start/end time stamping if we have stats enabled, or using - * an IO scheduler. + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, unsigned int op) + unsigned int tag, unsigned int op, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; @@ -325,11 +304,15 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; rq->io_start_time_ns = 0; + rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -356,8 +339,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *rq; unsigned int tag; bool clear_ctx_on_error = false; + u64 alloc_time_ns = 0; blk_queue_enter_live(q); + + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); @@ -393,7 +382,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns); if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; if (e && e->type->ops.prepare_request) { @@ -652,19 +641,14 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -void blk_mq_complete_request_sync(struct request *rq) -{ - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); - rq->q->mq_ops->complete(rq); -} -EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync); - -int blk_mq_request_started(struct request *rq) -{ - return blk_mq_rq_state(rq) != MQ_RQ_IDLE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_started); - +/** + * blk_mq_start_request - Start processing a request + * @rq: Pointer to request to be started + * + * Function used by device drivers to notify the block layer that a request + * is going to be processed now, so blk layer can do proper initializations + * such as starting the timeout timer. + */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -673,9 +657,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - rq->throtl_size = blk_rq_sectors(rq); -#endif + rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } @@ -693,6 +675,11 @@ void blk_mq_start_request(struct request *rq) */ rq->nr_phys_segments++; } + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); @@ -905,7 +892,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, */ if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (refcount_dec_and_test(&rq->ref)) + + if (is_flush_rq(rq, hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); return true; @@ -1048,7 +1038,7 @@ bool blk_mq_get_driver_tag(struct request *rq) bool shared; if (rq->tag != -1) - goto done; + return true; if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; @@ -1063,7 +1053,6 @@ bool blk_mq_get_driver_tag(struct request *rq) data.hctx->tags->rqs[rq->tag] = rq; } -done: return rq->tag != -1; } @@ -1346,6 +1335,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, return (queued + errors) != 0; } +/** + * __blk_mq_run_hw_queue - Run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * + * Send pending requests to the hardware. + */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; @@ -1443,6 +1438,15 @@ select_cpu: return next_cpu; } +/** + * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * @msecs: Microseconds of delay to wait before running the queue. + * + * If !@async, try to run the queue now. Else, run the queue asynchronously and + * with a delay of @msecs. + */ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { @@ -1464,13 +1468,29 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, msecs_to_jiffies(msecs)); } +/** + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. + * @hctx: Pointer to the hardware queue to run. + * @msecs: Microseconds of delay to wait before running the queue. + * + * Run a hardware queue asynchronously with a delay of @msecs. + */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { __blk_mq_delay_run_hw_queue(hctx, true, msecs); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +/** + * blk_mq_run_hw_queue - Start to run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * + * Check if the request queue is not in a quiesced state and if there are + * pending requests to be sent. If this is true, run the queue to send requests + * to hardware. + */ +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { int srcu_idx; bool need_run; @@ -1488,15 +1508,16 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) blk_mq_hctx_has_pending(hctx); hctx_unlock(hctx, srcu_idx); - if (need_run) { + if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); - return true; - } - - return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/** + * blk_mq_run_hw_queue - Run all hardware queues in a request queue. + * @q: Pointer to the request queue to run. + * @async: If we want to run the queue asynchronously. + */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; @@ -1648,7 +1669,11 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_hctx_mark_pending(hctx, ctx); } -/* +/** + * blk_mq_request_bypass_insert - Insert a request at dispatch list. + * @rq: Pointer to request to be inserted. + * @run_queue: If we should run the hardware queue after inserting the request. + * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ @@ -1691,28 +1716,20 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - if (rqa->mq_ctx < rqb->mq_ctx) - return -1; - else if (rqa->mq_ctx > rqb->mq_ctx) - return 1; - else if (rqa->mq_hctx < rqb->mq_hctx) - return -1; - else if (rqa->mq_hctx > rqb->mq_hctx) - return 1; + if (rqa->mq_ctx != rqb->mq_ctx) + return rqa->mq_ctx > rqb->mq_ctx; + if (rqa->mq_hctx != rqb->mq_hctx) + return rqa->mq_hctx > rqb->mq_hctx; return blk_rq_pos(rqa) > blk_rq_pos(rqb); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; - struct request_queue *this_q; - struct request *rq; LIST_HEAD(list); - LIST_HEAD(rq_list); - unsigned int depth; + if (list_empty(&plug->mq_list)) + return; list_splice_init(&plug->mq_list, &list); if (plug->rq_count > 2 && plug->multiple_queues) @@ -1720,42 +1737,27 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; - this_q = NULL; - this_hctx = NULL; - this_ctx = NULL; - depth = 0; - - while (!list_empty(&list)) { - rq = list_entry_rq(list.next); - list_del_init(&rq->queuelist); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { - if (this_hctx) { - trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &rq_list, - from_schedule); - } - - this_q = rq->q; - this_ctx = rq->mq_ctx; - this_hctx = rq->mq_hctx; - depth = 0; + do { + struct list_head rq_list; + struct request *rq, *head_rq = list_entry_rq(list.next); + struct list_head *pos = &head_rq->queuelist; /* skip first */ + struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; + struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; + unsigned int depth = 1; + + list_for_each_continue(pos, &list) { + rq = list_entry_rq(pos); + BUG_ON(!rq->q); + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) + break; + depth++; } - depth++; - list_add_tail(&rq->queuelist, &rq_list); - } - - /* - * If 'this_hctx' is set, we know we have entries to complete - * on 'rq_list'. Do those. - */ - if (this_hctx) { - trace_block_unplug(this_q, depth, !from_schedule); + list_cut_before(&rq_list, &list, pos); + trace_block_unplug(head_rq->q, depth, !from_schedule); blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, from_schedule); - } + } while(!list_empty(&list)); } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -1851,6 +1853,17 @@ insert: return BLK_STS_OK; } +/** + * blk_mq_try_issue_directly - Try to send a request directly to device driver. + * @hctx: Pointer of the associated hardware queue. + * @rq: Pointer to request to be sent. + * @cookie: Request queue cookie. + * + * If the device has enough resources to accept a new request now, send the + * request directly to device driver. Else, insert at hctx->dispatch queue, so + * we can try send it another time in the future. Requests inserted at this + * queue have higher priority. + */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { @@ -1928,6 +1941,22 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } } +/** + * blk_mq_make_request - Create and send a request to block device. + * @q: Request queue pointer. + * @bio: Bio pointer. + * + * Builds up a request structure from @q and @bio and send to the device. The + * request may not be queued directly to hardware if: + * * This request can be merged with another one + * * We want to place request at plug queue for possible future merging + * * There is an IO scheduler active at this queue + * + * It will not queue the request if there is an error with the bio, or at the + * request creation. + * + * Returns: Request queue cookie. + */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); @@ -1973,13 +2002,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { - /* bypass scheduler for flush rq */ + /* Bypass scheduler for flush requests */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || + !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. + * + * Use normal plugging if this disk is slow HDD, as sequential + * IO may benefit a lot from plug merging. */ unsigned int request_count = plug->rq_count; struct request *last = NULL; @@ -1996,6 +2029,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } blk_add_rq_to_plug(plug, rq); + } else if (q->elevator) { + /* Insert the request at the IO scheduler queue */ + blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { /* * We do limited plugging. If the bio can be merged, do that. @@ -2019,10 +2055,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && - !data.hctx->dispatch_busy)) { + } else if ((q->nr_hw_queues > 1 && is_sync) || + !data.hctx->dispatch_busy) { + /* + * There is no scheduler and we can try to send directly + * to the hardware. + */ blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { + /* Default case. */ blk_mq_sched_insert_request(rq, false, true, true); } @@ -2453,11 +2494,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; - /* - * Avoid others reading imcomplete hctx->cpumask through sysfs - */ - mutex_lock(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -2518,8 +2554,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) HCTX_TYPE_DEFAULT, i); } - mutex_unlock(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { /* * If no software queues are mapped to this hardware queue, @@ -2688,7 +2722,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!uninit_q) return ERR_PTR(-ENOMEM); - q = blk_mq_init_allocated_queue(set, uninit_q); + /* + * Initialize the queue without an elevator. device_add_disk() will do + * the initialization. + */ + q = blk_mq_init_allocated_queue(set, uninit_q, false); if (IS_ERR(q)) blk_cleanup_queue(uninit_q); @@ -2770,6 +2808,23 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + q->nr_hw_queues = set->nr_hw_queues; + kfree(hctxs); + hctxs = new_hctxs; + } + /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { @@ -2825,21 +2880,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -/* - * Maximum number of hardware queues we support. For single sets, we'll never - * have more than the CPUs (software queues). For multiple sets, the tag_set - * user may have set ->nr_hw_queues larger. - */ -static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) -{ - if (set->nr_maps == 1) - return nr_cpu_ids; - - return max(set->nr_hw_queues, nr_cpu_ids); -} - struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) + struct request_queue *q, + bool elevator_init) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -2856,12 +2899,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->nr_queues = nr_hw_queues(set); - q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), - GFP_KERNEL, set->numa_node); - if (!q->queue_hw_ctx) - goto err_sys_init; - INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); @@ -2901,19 +2938,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - if (!(set->flags & BLK_MQ_F_NO_SCHED)) { - int ret; - - ret = elevator_init_mq(q); - if (ret) - return ERR_PTR(ret); - } + if (elevator_init) + elevator_init_mq(q); return q; err_hctxs: kfree(q->queue_hw_ctx); -err_sys_init: + q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); err_poll: blk_stat_free_callback(q->poll_cb); @@ -3014,6 +3046,29 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) } } +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3067,9 +3122,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), - GFP_KERNEL, set->numa_node); - if (!set->tags) + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; @@ -3110,7 +3163,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; - for (i = 0; i < nr_hw_queues(set); i++) + for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); for (j = 0; j < set->nr_maps; j++) { @@ -3255,10 +3308,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); /* - * Sync with blk_mq_queue_tag_busy_iter. - */ - synchronize_rcu(); - /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. @@ -3272,6 +3321,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister(q); } + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -3288,6 +3341,7 @@ fallback: blk_mq_map_swqueue(q); } +reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register(q); blk_mq_debugfs_register_hctxs(q); @@ -3411,15 +3465,14 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, kt = nsecs; mode = HRTIMER_MODE_REL; - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&hs.timer, kt); - hrtimer_init_sleeper(&hs, current); do { if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_start_expires(&hs.timer, mode); + hrtimer_sleeper_start_expires(&hs, mode); if (hs.task) io_schedule(); hrtimer_cancel(&hs.timer); diff --git a/block/blk-mq.h b/block/blk-mq.h index 32c62c64e6c2..eaaca8fc1c28 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,15 +128,6 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_release(struct request_queue *q); -/** - * blk_mq_rq_state() - read the current MQ_RQ_* state of a request - * @rq: target request. - */ -static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) -{ - return READ_ONCE(rq->state); -} - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-pm.c b/block/blk-pm.c index 0a028c189897..1adc1cd748b4 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume); */ void blk_set_runtime_active(struct request_queue *q) { - spin_lock_irq(&q->queue_lock); - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - spin_unlock_irq(&q->queue_lock); + if (q->dev) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + spin_unlock_irq(&q->queue_lock); + } } EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 3954c0dc1443..656460636ad3 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) } while (rqos); } +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + do { + if (rqos->ops->merge) + rqos->ops->merge(rqos, rq, bio); + rqos = rqos->next; + } while (rqos); +} + void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { do { @@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) } while (rqos); } +void __rq_qos_queue_depth_changed(struct rq_qos *rqos) +{ + do { + if (rqos->ops->queue_depth_changed) + rqos->ops->queue_depth_changed(rqos); + rqos = rqos->next; + } while (rqos); +} + /* * Return true, if we can't increase the depth further by scaling */ @@ -142,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd) return ret; } -void rq_depth_scale_up(struct rq_depth *rqd) +/* Returns true on success and false if scaling up wasn't possible */ +bool rq_depth_scale_up(struct rq_depth *rqd) { /* * Hit max in previous round, stop here */ if (rqd->scaled_max) - return; + return false; rqd->scale_step--; rqd->scaled_max = rq_depth_calc_max_depth(rqd); + return true; } /* * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. + * had a latency violation. Returns true on success and returns false if + * scaling down wasn't possible. */ -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents @@ -167,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) * keep up. */ if (rqd->max_depth == 1) - return; + return false; if (rqd->scale_step < 0 && hard_throttle) rqd->scale_step = 0; @@ -176,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); + return true; } struct rq_qos_wait_data { diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2300e038b9fa..2bc43e94f4c4 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, - RQ_QOS_CGROUP, + RQ_QOS_LATENCY, + RQ_QOS_COST, }; struct rq_wait { @@ -35,11 +36,13 @@ struct rq_qos { struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); + void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); + void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; @@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) { - return rq_qos_id(q, RQ_QOS_CGROUP); + return rq_qos_id(q, RQ_QOS_LATENCY); } static inline const char *rq_qos_id_to_name(enum rq_qos_id id) @@ -80,8 +83,10 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id) switch (id) { case RQ_QOS_WBT: return "wbt"; - case RQ_QOS_CGROUP: - return "cgroup"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; } return "unknown"; } @@ -103,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { - struct rq_qos *cur, *prev = NULL; - for (cur = q->rq_qos; cur; cur = cur->next) { - if (cur == rqos) { - if (prev) - prev->next = rqos->next; - else - q->rq_qos = cur; + struct rq_qos **cur; + + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; break; } - prev = cur; } blk_mq_debugfs_unregister_rqos(rqos); @@ -125,8 +127,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); -void rq_depth_scale_up(struct rq_depth *rqd); -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_scale_up(struct rq_depth *rqd); +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); @@ -135,7 +137,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { @@ -185,6 +189,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, __rq_qos_track(q->rq_qos, rq, bio); } +static inline void rq_qos_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_merge(q->rq_qos, rq, bio); +} + +static inline void rq_qos_queue_depth_changed(struct request_queue *q) +{ + if (q->rq_qos) + __rq_qos_queue_depth_changed(q->rq_qos); +} + void rq_qos_exit(struct request_queue *); #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 2c1831207a8f..c8eda2e7b91e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -12,6 +12,7 @@ #include <linux/lcm.h> #include <linux/jiffies.h> #include <linux/gfp.h> +#include <linux/dma-mapping.h> #include "blk.h" #include "blk-wbt.h" @@ -327,7 +328,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); * storage device can address. The default of 512 covers most * hardware. **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { q->limits.logical_block_size = size; @@ -805,7 +806,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; - wbt_set_queue_depth(q, depth); + rq_qos_queue_depth_changed(q); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -832,6 +833,44 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); +/** + * blk_queue_required_elevator_features - Set a queue required elevator features + * @q: the request queue for the target device + * @features: Required elevator features OR'ed together + * + * Tell the block layer that for the device controlled through @q, only the + * only elevators that can be used are those that implement at least the set of + * features specified by @features. + */ +void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features) +{ + q->required_elevator_features = features; +} +EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); + +/** + * blk_queue_can_use_dma_map_merging - configure queue for merging segments. + * @q: the request queue for the device + * @dev: the device pointer for dma + * + * Tell the block layer about merging the segments by dma map of @q. + */ +bool blk_queue_can_use_dma_map_merging(struct request_queue *q, + struct device *dev) +{ + unsigned long boundary = dma_get_merge_boundary(dev); + + if (!boundary) + return false; + + /* No need to update max_segment_size. see blk_queue_virt_boundary() */ + blk_queue_virt_boundary(q, boundary); + + return true; +} +EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 457d9ba3eb20..6e7ec87d49fa 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -42,17 +42,13 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) static void trigger_softirq(void *data) { struct request *rq = data; - unsigned long flags; struct list_head *list; - local_irq_save(flags); list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); - - local_irq_restore(flags); } /* diff --git a/block/blk-stat.c b/block/blk-stat.c index 940f15d600f8..7da302ff88d0 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -53,7 +53,7 @@ void blk_stat_add(struct request *rq, u64 now) struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; - int bucket; + int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; @@ -61,6 +61,7 @@ void blk_stat_add(struct request *rq, u64 now) blk_throtl_stat_add(rq, value); rcu_read_lock(); + cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; @@ -69,10 +70,10 @@ void blk_stat_add(struct request *rq, u64 now) if (bucket < 0) continue; - stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; + stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); - put_cpu_ptr(cb->cpu_stat); } + put_cpu(); rcu_read_unlock(); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9bfa3ea4ed63..fca9b158f4a0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -482,7 +482,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); - wbt_update_limits(q); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); @@ -802,10 +801,6 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->show(q, page); mutex_unlock(&q->sysfs_lock); return res; @@ -824,10 +819,6 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; @@ -941,14 +932,14 @@ int blk_register_queue(struct gendisk *disk) int ret; struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; + bool has_elevator = false; if (WARN_ON(!q)) return -ENXIO; - WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* * SCSI probing may synchronously create and destroy a lot of @@ -968,8 +959,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret; - /* Prevent changes through sysfs until registration is completed. */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { @@ -990,26 +980,33 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); } - kobject_uevent(&q->kobj, KOBJ_ADD); - - wbt_enable_default(q); - - blk_throtl_register_queue(q); - + mutex_lock(&q->sysfs_lock); if (q->elevator) { - ret = elv_register_queue(q); + ret = elv_register_queue(q, false); if (ret) { mutex_unlock(&q->sysfs_lock); - kobject_uevent(&q->kobj, KOBJ_REMOVE); + mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } + has_elevator = true; } + + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(q); + blk_throtl_register_queue(q); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&q->kobj, KOBJ_ADD); + if (has_elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); + ret = 0; unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); @@ -1029,7 +1026,7 @@ void blk_unregister_queue(struct gendisk *disk) return; /* Return early if disk->queue was never registered. */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; /* @@ -1038,16 +1035,16 @@ void blk_unregister_queue(struct gendisk *disk) * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); - blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); @@ -1057,6 +1054,7 @@ void blk_unregister_queue(struct gendisk *disk) if (q->elevator) elv_unregister_queue(q); mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8ab6c8153223..98233c9c65a8 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,6 +12,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-cgroup.h> #include "blk.h" +#include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -176,6 +177,9 @@ struct throtl_grp { unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; }; /* We measure latency for request size from <= 4k to >= 1M */ @@ -478,15 +482,23 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, node); + tg = kzalloc_node(sizeof(*tg), gfp, q->node); if (!tg) return NULL; + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; + throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { @@ -511,6 +523,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -609,6 +627,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } @@ -1462,6 +1482,32 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", @@ -1489,23 +1535,23 @@ static struct cftype throtl_legacy_files[] = { }, { .name = "throttle.io_service_bytes", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; @@ -2125,7 +2171,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED)) + goto out; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) goto out; spin_lock_irq(&q->queue_lock); @@ -2246,7 +2301,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) struct request_queue *q = rq->q; struct throtl_data *td = q->td; - throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10); + throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), + time_ns >> 10); } void blk_throtl_bio_endio(struct bio *bio) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 313f45a37e9d..8641ba9793c5 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -308,7 +308,8 @@ static void calc_wb_limits(struct rq_wb *rwb) static void scale_up(struct rq_wb *rwb) { - rq_depth_scale_up(&rwb->rq_depth); + if (!rq_depth_scale_up(&rwb->rq_depth)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); @@ -317,7 +318,8 @@ static void scale_up(struct rq_wb *rwb) static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - rq_depth_scale_down(&rwb->rq_depth, hard_throttle); + if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, "scale down"); @@ -629,15 +631,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) { - RQWB(rqos)->rq_depth.queue_depth = depth; - __wbt_update_limits(RQWB(rqos)); - } -} - void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { struct rq_qos *rqos = wbt_rq_qos(q); @@ -656,7 +649,7 @@ void wbt_enable_default(struct request_queue *q) return; /* Queue not registered? Maybe shutting down... */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) @@ -689,6 +682,12 @@ static int wbt_data_dir(const struct request *rq) return -1; } +static void wbt_queue_depth_changed(struct rq_qos *rqos) +{ + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + __wbt_update_limits(RQWB(rqos)); +} + static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); @@ -811,6 +810,7 @@ static struct rq_qos_ops wbt_rqos_ops = { .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, + .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, @@ -853,7 +853,7 @@ int wbt_init(struct request_queue *q) rwb->min_lat_nsec = wbt_default_latency_nsec(q); - wbt_set_queue_depth(q, blk_queue_depth(q)); + wbt_queue_depth_changed(&rwb->rqos); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index f47218d5b3b2..8e4e37660971 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -95,7 +95,6 @@ void wbt_enable_default(struct request_queue *); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); -void wbt_set_queue_depth(struct request_queue *, unsigned int); void wbt_set_write_cache(struct request_queue *, bool); u64 wbt_default_latency_nsec(struct request_queue *); @@ -118,9 +117,6 @@ static inline void wbt_disable_default(struct request_queue *q) static inline void wbt_enable_default(struct request_queue *q) { } -static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) -{ -} static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6c503824ba3f..05741c6f618b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -70,159 +70,98 @@ void __blk_req_zone_write_unlock(struct request *rq) } EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); -static inline unsigned int __blkdev_nr_zones(struct request_queue *q, - sector_t nr_sectors) -{ - sector_t zone_sectors = blk_queue_zone_sectors(q); - - return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); -} - /** * blkdev_nr_zones - Get number of zones - * @bdev: Target block device + * @disk: Target gendisk * - * Description: - * Return the total number of zones of a zoned block device. - * For a regular block device, the number of zones is always 0. + * Return the total number of zones of a zoned block device. For a block + * device without zone capabilities, the number of zones is always 0. */ -unsigned int blkdev_nr_zones(struct block_device *bdev) +unsigned int blkdev_nr_zones(struct gendisk *disk) { - struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); - if (!blk_queue_is_zoned(q)) + if (!blk_queue_is_zoned(disk->queue)) return 0; - - return __blkdev_nr_zones(q, bdev->bd_part->nr_sects); + return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); } EXPORT_SYMBOL_GPL(blkdev_nr_zones); -/* - * Check that a zone report belongs to this partition, and if yes, fix its start - * sector and write pointer and return true. Return false otherwise. - */ -static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep) -{ - sector_t offset = get_start_sect(bdev); - - if (rep->start < offset) - return false; - - rep->start -= offset; - if (rep->start + rep->len > bdev->bd_part->nr_sects) - return false; - - if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) - rep->wp = rep->start + rep->len; - else - rep->wp -= offset; - return true; -} - -static int blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) -{ - struct request_queue *q = disk->queue; - unsigned int z = 0, n, nrz = *nr_zones; - sector_t capacity = get_capacity(disk); - int ret; - - while (z < nrz && sector < capacity) { - n = nrz - z; - ret = disk->fops->report_zones(disk, sector, &zones[z], &n); - if (ret) - return ret; - if (!n) - break; - sector += blk_queue_zone_sectors(q) * n; - z += n; - } - - WARN_ON(z > *nr_zones); - *nr_zones = z; - - return 0; -} - /** * blkdev_report_zones - Get zones information * @bdev: Target block device * @sector: Sector from which to report zones - * @zones: Array of zone structures where to return the zones information - * @nr_zones: Number of zone structures in the zone array + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback * * Description: - * Get zone information starting from the zone containing @sector. - * The number of zone information reported may be less than the number - * requested by @nr_zones. The number of zones actually reported is - * returned in @nr_zones. - * The caller must use memalloc_noXX_save/restore() calls to control - * memory allocations done within this function (zone array and command - * buffer allocation by the device driver). + * Get zone information starting from the zone containing @sector for at most + * @nr_zones, and call @cb for each zone reported by the device. + * To report all zones in a device starting from @sector, the BLK_ALL_ZONES + * constant can be passed to @nr_zones. + * Returns the number of zones reported by the device, or a negative errno + * value in case of failure. + * + * Note: The caller must use memalloc_noXX_save/restore() calls to control + * memory allocations done within this function. */ int blkdev_report_zones(struct block_device *bdev, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) + unsigned int nr_zones, report_zones_cb cb, void *data) { - struct request_queue *q = bdev_get_queue(bdev); - unsigned int i, nrz; - int ret; - - if (!blk_queue_is_zoned(q)) - return -EOPNOTSUPP; + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); - /* - * A block device that advertized itself as zoned must have a - * report_zones method. If it does not have one defined, the device - * driver has a bug. So warn about that. - */ - if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones)) + if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || + WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; - if (!*nr_zones || sector >= bdev->bd_part->nr_sects) { - *nr_zones = 0; + if (!nr_zones || sector >= capacity) return 0; - } - nrz = min(*nr_zones, - __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); - ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, - zones, &nrz); - if (ret) - return ret; - - for (i = 0; i < nrz; i++) { - if (!blkdev_report_zone(bdev, zones)) - break; - zones++; - } + return disk->fops->report_zones(disk, sector, nr_zones, cb, data); +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); - *nr_zones = i; +static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t sector, + sector_t nr_sectors) +{ + if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) + return false; - return 0; + /* + * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors + * of the applicable zone range is the entire disk. + */ + return !sector && nr_sectors == get_capacity(bdev->bd_disk); } -EXPORT_SYMBOL_GPL(blkdev_report_zones); /** - * blkdev_reset_zones - Reset zones write pointer + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device - * @sector: Start sector of the first zone to reset - * @nr_sectors: Number of sectors, at least the length of one zone + * @op: Operation to be performed on the zones + * @sector: Start sector of the first zone to operate on + * @nr_sectors: Number of sectors, should be at least the length of one zone and + * must be zone size aligned. * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: - * Reset the write pointer of the zones contained in the range + * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. + * The operation to execute on each zone can be a zone reset, open, close + * or finish request. */ -int blkdev_reset_zones(struct block_device *bdev, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors; + sector_t zone_sectors = blk_queue_zone_sectors(q); + sector_t capacity = get_capacity(bdev->bd_disk); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - struct blk_plug plug; int ret; if (!blk_queue_is_zoned(q)) @@ -231,42 +170,62 @@ int blkdev_reset_zones(struct block_device *bdev, if (bdev_read_only(bdev)) return -EPERM; - if (!nr_sectors || end_sector > bdev->bd_part->nr_sects) + if (!op_is_zone_mgmt(op)) + return -EOPNOTSUPP; + + if (!nr_sectors || end_sector > capacity) /* Out of range */ return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) return -EINVAL; - if ((nr_sectors & (zone_sectors - 1)) && - end_sector != bdev->bd_part->nr_sects) + if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) return -EINVAL; - blk_start_plug(&plug); while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + /* + * Special case for the zone reset operation that reset all + * zones, this is useful for applications like mkfs. + */ + if (op == REQ_OP_ZONE_RESET && + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + break; + } + + bio->bi_opf = op | REQ_SYNC; + bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); - } ret = submit_bio_wait(bio); bio_put(bio); - blk_finish_plug(&plug); - return ret; } -EXPORT_SYMBOL_GPL(blkdev_reset_zones); +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); + +struct zone_report_args { + struct blk_zone __user *zones; +}; + +static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zone_report_args *args = data; + + if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) + return -EFAULT; + return 0; +} /* * BLKREPORTZONE ioctl processing. @@ -276,9 +235,9 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; + struct zone_report_args args; struct request_queue *q; struct blk_zone_report rep; - struct blk_zone *zones; int ret; if (!argp) @@ -300,44 +259,29 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!rep.nr_zones) return -EINVAL; - rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones); - - zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); - if (!zones) - return -ENOMEM; - - ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones); - if (ret) - goto out; - - if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { - ret = -EFAULT; - goto out; - } - - if (rep.nr_zones) { - if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, - sizeof(struct blk_zone) * rep.nr_zones)) - ret = -EFAULT; - } - - out: - kvfree(zones); + args.zones = argp + sizeof(struct blk_zone_report); + ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, + blkdev_copy_zone_to_user, &args); + if (ret < 0) + return ret; - return ret; + rep.nr_zones = ret; + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) + return -EFAULT; + return 0; } /* - * BLKRESETZONE ioctl processing. + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct request_queue *q; struct blk_zone_range zrange; + enum req_opf op; if (!argp) return -EINVAL; @@ -358,8 +302,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + switch (cmd) { + case BLKRESETZONE: + op = REQ_OP_ZONE_RESET; + break; + case BLKOPENZONE: + op = REQ_OP_ZONE_OPEN; + break; + case BLKCLOSEZONE: + op = REQ_OP_ZONE_CLOSE; + break; + case BLKFINISHZONE: + op = REQ_OP_ZONE_FINISH; + break; + default: + return -ENOTTY; + } + + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, @@ -369,37 +330,96 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, GFP_NOIO, node); } +void blk_queue_free_zone_bitmaps(struct request_queue *q) +{ + kfree(q->conv_zones_bitmap); + q->conv_zones_bitmap = NULL; + kfree(q->seq_zones_wlock); + q->seq_zones_wlock = NULL; +} + +struct blk_revalidate_zone_args { + struct gendisk *disk; + unsigned long *conv_zones_bitmap; + unsigned long *seq_zones_wlock; + unsigned int nr_zones; + sector_t zone_sectors; + sector_t sector; +}; + /* - * Allocate an array of struct blk_zone to get nr_zones zone information. - * The allocated array may be smaller than nr_zones. + * Helper function to check the validity of zones of a zoned block device. */ -static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - struct blk_zone *zones; - size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); + struct blk_revalidate_zone_args *args = data; + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + sector_t capacity = get_capacity(disk); /* - * GFP_KERNEL here is meaningless as the caller task context has - * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() - * with memalloc_noio_save(). + * All zones must have the same size, with the exception on an eventual + * smaller last zone. */ - zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); - if (!zones) { - *nr_zones = 0; - return NULL; + if (zone->start == 0) { + if (zone->len == 0 || !is_power_of_2(zone->len)) { + pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", + disk->disk_name, zone->len); + return -ENODEV; + } + + args->zone_sectors = zone->len; + args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); + } else if (zone->start + args->zone_sectors < capacity) { + if (zone->len != args->zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return -ENODEV; + } + } else { + if (zone->len > args->zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } } - *nr_zones = nrz; + /* Check for holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } - return zones; -} + /* Check zone type */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, args->conv_zones_bitmap); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!args->seq_zones_wlock) { + args->seq_zones_wlock = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->seq_zones_wlock) + return -ENOMEM; + } + break; + default: + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", + disk->disk_name, (int)zone->type, zone->start); + return -ENODEV; + } -void blk_queue_free_zone_bitmaps(struct request_queue *q) -{ - kfree(q->seq_zones_bitmap); - q->seq_zones_bitmap = NULL; - kfree(q->seq_zones_wlock); - q->seq_zones_wlock = NULL; + args->sector += zone->len; + return 0; } /** @@ -408,102 +428,53 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q) * * Helper function for low-level device drivers to (re) allocate and initialize * a disk request queue zone bitmaps. This functions should normally be called - * within the disk ->revalidate method. For BIO based queues, no zone bitmap - * is allocated. + * within the disk ->revalidate method for blk-mq based drivers. For BIO based + * drivers only q->nr_zones needs to be updated so that the sysfs exposed value + * is correct. */ int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; - unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); - unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; - unsigned int i, rep_nr_zones = 0, z = 0, nrz; - struct blk_zone *zones = NULL; + struct blk_revalidate_zone_args args = { + .disk = disk, + }; unsigned int noio_flag; - sector_t sector = 0; - int ret = 0; + int ret; - /* - * BIO based queues do not use a scheduler so only q->nr_zones - * needs to be updated so that the sysfs exposed value is correct. - */ - if (!queue_is_mq(q)) { - q->nr_zones = nr_zones; - return 0; - } + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return -EIO; + if (WARN_ON_ONCE(!queue_is_mq(q))) + return -EIO; /* - * Ensure that all memory allocations in this context are done as - * if GFP_NOIO was specified. + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. */ noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, + blk_revalidate_zone_cb, &args); + memalloc_noio_restore(noio_flag); - if (!blk_queue_is_zoned(q) || !nr_zones) { - nr_zones = 0; - goto update; - } - - /* Allocate bitmaps */ - ret = -ENOMEM; - seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!seq_zones_wlock) - goto out; - seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!seq_zones_bitmap) - goto out; - - /* Get zone information and initialize seq_zones_bitmap */ - rep_nr_zones = nr_zones; - zones = blk_alloc_zones(&rep_nr_zones); - if (!zones) - goto out; - - while (z < nr_zones) { - nrz = min(nr_zones - z, rep_nr_zones); - ret = blk_report_zones(disk, sector, zones, &nrz); - if (ret) - goto out; - if (!nrz) - break; - for (i = 0; i < nrz; i++) { - if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(z, seq_zones_bitmap); - z++; - } - sector += nrz * blk_queue_zone_sectors(q); - } - - if (WARN_ON(z != nr_zones)) { - ret = -EIO; - goto out; - } - -update: /* - * Install the new bitmaps, making sure the queue is stopped and - * all I/Os are completed (i.e. a scheduler is not referencing the - * bitmaps). + * Install the new bitmaps and update nr_zones only once the queue is + * stopped and all I/Os are completed (i.e. a scheduler is not + * referencing the bitmaps). */ blk_mq_freeze_queue(q); - q->nr_zones = nr_zones; - swap(q->seq_zones_wlock, seq_zones_wlock); - swap(q->seq_zones_bitmap, seq_zones_bitmap); - blk_mq_unfreeze_queue(q); - -out: - memalloc_noio_restore(noio_flag); - - kvfree(zones); - kfree(seq_zones_wlock); - kfree(seq_zones_bitmap); - - if (ret) { + if (ret >= 0) { + blk_queue_chunk_sectors(q, args.zone_sectors); + q->nr_zones = args.nr_zones; + swap(q->seq_zones_wlock, args.seq_zones_wlock); + swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + ret = 0; + } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - blk_mq_freeze_queue(q); blk_queue_free_zone_bitmaps(q); - blk_mq_unfreeze_queue(q); } + blk_mq_unfreeze_queue(q); + kfree(args.seq_zones_wlock); + kfree(args.conv_zones_bitmap); return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - diff --git a/block/blk.h b/block/blk.h index de6b2e146d6e..0b8884353f6b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -19,6 +19,7 @@ struct blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; + blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; @@ -29,6 +30,7 @@ struct blk_flush_queue { * at the same time */ struct request *orig_rq; + struct lock_class_key key; spinlock_t mq_flush_lock; }; @@ -47,6 +49,12 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +static inline bool +is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) +{ + return hctx->fq->flush_rq == req; +} + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); @@ -114,6 +122,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); +void bio_integrity_free(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { if (bio_integrity(bio)) @@ -159,6 +168,9 @@ static inline bool bio_integrity_endio(struct bio *bio) { return true; } +static inline void bio_integrity_free(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); @@ -184,16 +196,18 @@ void blk_account_io_done(struct request *req, u64 now); void blk_insert_flush(struct request *rq); -int elevator_init_mq(struct request_queue *q); +void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); -int elv_register_queue(struct request_queue *q); +int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, struct elevator_queue *e) { + lockdep_assert_held(&q->sysfs_lock); + blk_mq_sched_free_requests(q); __elevator_exit(q, e); } @@ -233,14 +247,11 @@ int blk_dev_init(void); * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request + * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && - (rq->rq_flags & RQF_IO_STAT) && - !blk_rq_is_passthrough(rq); + return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 785dd58947f1..6cbb7926534c 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); + blk_status_t sts = BLK_STS_IOERR; int ret; blk_mq_start_request(req); @@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_IOERR; if (!bsg_prepare_job(dev, req)) - return BLK_STS_IOERR; + goto out; ret = bset->job_fn(blk_mq_rq_to_pdu(req)); - if (ret) - return BLK_STS_IOERR; + if (!ret) + sts = BLK_STS_OK; +out: put_device(dev); - return BLK_STS_OK; + return sts; } /* called right after the request is allocated for the request_queue */ diff --git a/block/bsg.c b/block/bsg.c index 833c44b3d458..d7bae94b64d9 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -382,6 +382,7 @@ static const struct file_operations bsg_fops = { .open = bsg_open, .release = bsg_release, .unlocked_ioctl = bsg_ioctl, + .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c deleted file mode 100644 index 6ca015f92766..000000000000 --- a/block/compat_ioctl.c +++ /dev/null @@ -1,411 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/blkdev.h> -#include <linux/blkpg.h> -#include <linux/blktrace_api.h> -#include <linux/cdrom.h> -#include <linux/compat.h> -#include <linux/elevator.h> -#include <linux/hdreg.h> -#include <linux/slab.h> -#include <linux/syscalls.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -static int compat_put_ushort(unsigned long arg, unsigned short val) -{ - return put_user(val, (unsigned short __user *)compat_ptr(arg)); -} - -static int compat_put_int(unsigned long arg, int val) -{ - return put_user(val, (compat_int_t __user *)compat_ptr(arg)); -} - -static int compat_put_uint(unsigned long arg, unsigned int val) -{ - return put_user(val, (compat_uint_t __user *)compat_ptr(arg)); -} - -static int compat_put_long(unsigned long arg, long val) -{ - return put_user(val, (compat_long_t __user *)compat_ptr(arg)); -} - -static int compat_put_ulong(unsigned long arg, compat_ulong_t val) -{ - return put_user(val, (compat_ulong_t __user *)compat_ptr(arg)); -} - -static int compat_put_u64(unsigned long arg, u64 val) -{ - return put_user(val, (compat_u64 __user *)compat_ptr(arg)); -} - -struct compat_hd_geometry { - unsigned char heads; - unsigned char sectors; - unsigned short cylinders; - u32 start; -}; - -static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, - struct compat_hd_geometry __user *ugeo) -{ - struct hd_geometry geo; - int ret; - - if (!ugeo) - return -EINVAL; - if (!disk->fops->getgeo) - return -ENOTTY; - - memset(&geo, 0, sizeof(geo)); - /* - * We need to set the startsect first, the driver may - * want to override it. - */ - geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); - if (ret) - return ret; - - ret = copy_to_user(ugeo, &geo, 4); - ret |= put_user(geo.start, &ugeo->start); - if (ret) - ret = -EFAULT; - - return ret; -} - -static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - unsigned long __user *p; - int error; - - p = compat_alloc_user_space(sizeof(unsigned long)); - error = __blkdev_driver_ioctl(bdev, mode, - cmd, (unsigned long)p); - if (error == 0) { - unsigned int __user *uvp = compat_ptr(arg); - unsigned long v; - if (get_user(v, p) || put_user(v, uvp)) - error = -EFAULT; - } - return error; -} - -struct compat_cdrom_read_audio { - union cdrom_addr addr; - u8 addr_format; - compat_int_t nframes; - compat_caddr_t buf; -}; - -struct compat_cdrom_generic_command { - unsigned char cmd[CDROM_PACKET_SIZE]; - compat_caddr_t buffer; - compat_uint_t buflen; - compat_int_t stat; - compat_caddr_t sense; - unsigned char data_direction; - compat_int_t quiet; - compat_int_t timeout; - compat_caddr_t reserved[1]; -}; - -static int compat_cdrom_read_audio(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct cdrom_read_audio __user *cdread_audio; - struct compat_cdrom_read_audio __user *cdread_audio32; - __u32 data; - void __user *datap; - - cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio)); - cdread_audio32 = compat_ptr(arg); - - if (copy_in_user(&cdread_audio->addr, - &cdread_audio32->addr, - (sizeof(*cdread_audio32) - - sizeof(compat_caddr_t)))) - return -EFAULT; - - if (get_user(data, &cdread_audio32->buf)) - return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &cdread_audio->buf)) - return -EFAULT; - - return __blkdev_driver_ioctl(bdev, mode, cmd, - (unsigned long)cdread_audio); -} - -static int compat_cdrom_generic_command(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct cdrom_generic_command __user *cgc; - struct compat_cdrom_generic_command __user *cgc32; - u32 data; - unsigned char dir; - int itmp; - - cgc = compat_alloc_user_space(sizeof(*cgc)); - cgc32 = compat_ptr(arg); - - if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) || - get_user(data, &cgc32->buffer) || - put_user(compat_ptr(data), &cgc->buffer) || - copy_in_user(&cgc->buflen, &cgc32->buflen, - (sizeof(unsigned int) + sizeof(int))) || - get_user(data, &cgc32->sense) || - put_user(compat_ptr(data), &cgc->sense) || - get_user(dir, &cgc32->data_direction) || - put_user(dir, &cgc->data_direction) || - get_user(itmp, &cgc32->quiet) || - put_user(itmp, &cgc->quiet) || - get_user(itmp, &cgc32->timeout) || - put_user(itmp, &cgc->timeout) || - get_user(data, &cgc32->reserved[0]) || - put_user(compat_ptr(data), &cgc->reserved[0])) - return -EFAULT; - - return __blkdev_driver_ioctl(bdev, mode, cmd, (unsigned long)cgc); -} - -struct compat_blkpg_ioctl_arg { - compat_int_t op; - compat_int_t flags; - compat_int_t datalen; - compat_caddr_t data; -}; - -static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, struct compat_blkpg_ioctl_arg __user *ua32) -{ - struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a)); - compat_caddr_t udata; - compat_int_t n; - int err; - - err = get_user(n, &ua32->op); - err |= put_user(n, &a->op); - err |= get_user(n, &ua32->flags); - err |= put_user(n, &a->flags); - err |= get_user(n, &ua32->datalen); - err |= put_user(n, &a->datalen); - err |= get_user(udata, &ua32->data); - err |= put_user(compat_ptr(udata), &a->data); - if (err) - return err; - - return blkdev_ioctl(bdev, mode, cmd, (unsigned long)a); -} - -#define BLKBSZGET_32 _IOR(0x12, 112, int) -#define BLKBSZSET_32 _IOW(0x12, 113, int) -#define BLKGETSIZE64_32 _IOR(0x12, 114, int) - -static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - switch (cmd) { - case HDIO_GET_UNMASKINTR: - case HDIO_GET_MULTCOUNT: - case HDIO_GET_KEEPSETTINGS: - case HDIO_GET_32BIT: - case HDIO_GET_NOWERR: - case HDIO_GET_DMA: - case HDIO_GET_NICE: - case HDIO_GET_WCACHE: - case HDIO_GET_ACOUSTIC: - case HDIO_GET_ADDRESS: - case HDIO_GET_BUSSTATE: - return compat_hdio_ioctl(bdev, mode, cmd, arg); - case CDROMREADAUDIO: - return compat_cdrom_read_audio(bdev, mode, cmd, arg); - case CDROM_SEND_PACKET: - return compat_cdrom_generic_command(bdev, mode, cmd, arg); - - /* - * No handler required for the ones below, we just need to - * convert arg to a 64 bit pointer. - */ - case BLKSECTSET: - /* - * 0x03 -- HD/IDE ioctl's used by hdparm and friends. - * Some need translations, these do not. - */ - case HDIO_GET_IDENTITY: - case HDIO_DRIVE_TASK: - case HDIO_DRIVE_CMD: - /* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */ - case 0x330: - /* CDROM stuff */ - case CDROMPAUSE: - case CDROMRESUME: - case CDROMPLAYMSF: - case CDROMPLAYTRKIND: - case CDROMREADTOCHDR: - case CDROMREADTOCENTRY: - case CDROMSTOP: - case CDROMSTART: - case CDROMEJECT: - case CDROMVOLCTRL: - case CDROMSUBCHNL: - case CDROMMULTISESSION: - case CDROM_GET_MCN: - case CDROMRESET: - case CDROMVOLREAD: - case CDROMSEEK: - case CDROMPLAYBLK: - case CDROMCLOSETRAY: - case CDROM_DISC_STATUS: - case CDROM_CHANGER_NSLOTS: - case CDROM_GET_CAPABILITY: - /* Ignore cdrom.h about these next 5 ioctls, they absolutely do - * not take a struct cdrom_read, instead they take a struct cdrom_msf - * which is compatible. - */ - case CDROMREADMODE2: - case CDROMREADMODE1: - case CDROMREADRAW: - case CDROMREADCOOKED: - case CDROMREADALL: - /* DVD ioctls */ - case DVD_READ_STRUCT: - case DVD_WRITE_STRUCT: - case DVD_AUTH: - arg = (unsigned long)compat_ptr(arg); - /* These intepret arg as an unsigned long, not as a pointer, - * so we must not do compat_ptr() conversion. */ - case HDIO_SET_MULTCOUNT: - case HDIO_SET_UNMASKINTR: - case HDIO_SET_KEEPSETTINGS: - case HDIO_SET_32BIT: - case HDIO_SET_NOWERR: - case HDIO_SET_DMA: - case HDIO_SET_PIO_MODE: - case HDIO_SET_NICE: - case HDIO_SET_WCACHE: - case HDIO_SET_ACOUSTIC: - case HDIO_SET_BUSSTATE: - case HDIO_SET_ADDRESS: - case CDROMEJECT_SW: - case CDROM_SET_OPTIONS: - case CDROM_CLEAR_OPTIONS: - case CDROM_SELECT_SPEED: - case CDROM_SELECT_DISC: - case CDROM_MEDIA_CHANGED: - case CDROM_DRIVE_STATUS: - case CDROM_LOCKDOOR: - case CDROM_DEBUG: - break; - default: - /* unknown ioctl number */ - return -ENOIOCTLCMD; - } - - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); -} - -/* Most of the generic ioctls are handled in the normal fallback path. - This assumes the blkdev's low level compat_ioctl always returns - ENOIOCTLCMD for unknown ioctls. */ -long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - int ret = -ENOIOCTLCMD; - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; - struct gendisk *disk = bdev->bd_disk; - fmode_t mode = file->f_mode; - loff_t size; - unsigned int max_sectors; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - switch (cmd) { - case HDIO_GETGEO: - return compat_hdio_getgeo(disk, bdev, compat_ptr(arg)); - case BLKPBSZGET: - return compat_put_uint(arg, bdev_physical_block_size(bdev)); - case BLKIOMIN: - return compat_put_uint(arg, bdev_io_min(bdev)); - case BLKIOOPT: - return compat_put_uint(arg, bdev_io_opt(bdev)); - case BLKALIGNOFF: - return compat_put_int(arg, bdev_alignment_offset(bdev)); - case BLKDISCARDZEROES: - return compat_put_uint(arg, 0); - case BLKFLSBUF: - case BLKROSET: - case BLKDISCARD: - case BLKSECDISCARD: - case BLKZEROOUT: - /* - * the ones below are implemented in blkdev_locked_ioctl, - * but we call blkdev_ioctl, which gets the lock for us - */ - case BLKRRPART: - return blkdev_ioctl(bdev, mode, cmd, - (unsigned long)compat_ptr(arg)); - case BLKBSZSET_32: - return blkdev_ioctl(bdev, mode, BLKBSZSET, - (unsigned long)compat_ptr(arg)); - case BLKPG: - return compat_blkpg_ioctl(bdev, mode, cmd, compat_ptr(arg)); - case BLKRAGET: - case BLKFRAGET: - if (!arg) - return -EINVAL; - return compat_put_long(arg, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); - case BLKROGET: /* compatible */ - return compat_put_int(arg, bdev_read_only(bdev) != 0); - case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ - return compat_put_int(arg, block_size(bdev)); - case BLKSSZGET: /* get block device hardware sector size */ - return compat_put_int(arg, bdev_logical_block_size(bdev)); - case BLKSECTGET: - max_sectors = min_t(unsigned int, USHRT_MAX, - queue_max_sectors(bdev_get_queue(bdev))); - return compat_put_ushort(arg, max_sectors); - case BLKROTATIONAL: - return compat_put_ushort(arg, - !blk_queue_nonrot(bdev_get_queue(bdev))); - case BLKRASET: /* compatible, but no compat_ptr (!) */ - case BLKFRASET: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; - return 0; - case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) - return -EFBIG; - return compat_put_ulong(arg, size >> 9); - - case BLKGETSIZE64_32: - return compat_put_u64(arg, i_size_read(bdev->bd_inode)); - - case BLKTRACESETUP32: - case BLKTRACESTART: /* compatible */ - case BLKTRACESTOP: /* compatible */ - case BLKTRACETEARDOWN: /* compatible */ - ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg)); - return ret; - default: - if (disk->fops->compat_ioctl) - ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); - if (ret == -ENOIOCTLCMD) - ret = compat_blkdev_driver_ioctl(bdev, mode, cmd, arg); - return ret; - } -} diff --git a/block/elevator.c b/block/elevator.c index 2f17d66d0e61..4eab3d70e880 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static bool elevator_match(const struct elevator_type *e, const char *name) +static inline bool elv_support_features(unsigned int elv_features, + unsigned int required_features) { + return (required_features & elv_features) == required_features; +} + +/** + * elevator_match - Test an elevator name and features + * @e: Scheduler to test + * @name: Elevator name to test + * @required_features: Features that the elevator must provide + * + * Return true is the elevator @e name matches @name and if @e provides all the + * the feratures spcified by @required_features. + */ +static bool elevator_match(const struct elevator_type *e, const char *name, + unsigned int required_features) +{ + if (!elv_support_features(e->elevator_features, required_features)) + return false; if (!strcmp(e->elevator_name, name)) return true; if (e->elevator_alias && !strcmp(e->elevator_alias, name)) @@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name) return false; } -/* - * Return scheduler with name 'name' +/** + * elevator_find - Find an elevator + * @name: Name of the elevator to find + * @required_features: Features that the elevator must provide + * + * Return the first registered scheduler with name @name and supporting the + * features @required_features and NULL otherwise. */ -static struct elevator_type *elevator_find(const char *name) +static struct elevator_type *elevator_find(const char *name, + unsigned int required_features) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name)) + if (elevator_match(e, name, required_features)) return e; } @@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); } if (e && !try_module_get(e->elevator_owner)) @@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q, return e; } -static char chosen_elevator[ELV_NAME_MAX]; - -static int __init elevator_setup(char *str) -{ - /* - * Be backwards-compatible with previous kernels, so users - * won't get the wrong elevator. - */ - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); - return 1; -} - -__setup("elevator=", elevator_setup); - static struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, @@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q) +/* + * elv_register_queue is called from either blk_register_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ +int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -487,21 +500,27 @@ int elv_register_queue(struct request_queue *q) attr++; } } - kobject_uevent(&e->kobj, KOBJ_ADD); + if (uevent) + kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } +/* + * elv_unregister_queue is called from either blk_unregister_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ void elv_unregister_queue(struct request_queue *q) { - lockdep_assert_held(&q->sysfs_lock); - if (q) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); @@ -526,7 +545,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, 0)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -569,6 +588,7 @@ int elevator_switch_mq(struct request_queue *q, if (q->elevator) { if (q->elevator->registered) elv_unregister_queue(q); + ioc_clear_queue(q); elevator_exit(q, q->elevator); } @@ -578,7 +598,7 @@ int elevator_switch_mq(struct request_queue *q, goto out; if (new_e) { - ret = elv_register_queue(q); + ret = elv_register_queue(q, true); if (ret) { elevator_exit(q, q->elevator); goto out; @@ -594,37 +614,90 @@ out: return ret; } +static inline bool elv_support_iosched(struct request_queue *q) +{ + if (!q->mq_ops || + (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) + return false; + return true; +} + /* - * For blk-mq devices, we default to using mq-deadline, if available, for single - * queue devices. If deadline isn't available OR we have multiple queues, - * default to "none". + * For single queue devices, default to using mq-deadline. If we have multiple + * queues or mq-deadline is not available, default to "none". */ -int elevator_init_mq(struct request_queue *q) +static struct elevator_type *elevator_get_default(struct request_queue *q) +{ + if (q->nr_hw_queues != 1) + return NULL; + + return elevator_get(q, "mq-deadline", false); +} + +/* + * Get the first elevator providing the features required by the request queue. + * Default to "none" if no matching elevator is found. + */ +static struct elevator_type *elevator_get_by_features(struct request_queue *q) +{ + struct elevator_type *e, *found = NULL; + + spin_lock(&elv_list_lock); + + list_for_each_entry(e, &elv_list, list) { + if (elv_support_features(e->elevator_features, + q->required_elevator_features)) { + found = e; + break; + } + } + + if (found && !try_module_get(found->elevator_owner)) + found = NULL; + + spin_unlock(&elv_list_lock); + return found; +} + +/* + * For a device queue that has no required features, use the default elevator + * settings. Otherwise, use the first elevator available matching the required + * features. If no suitable elevator is find or if the chosen elevator + * initialization fails, fall back to the "none" elevator (no elevator). + */ +void elevator_init_mq(struct request_queue *q) { struct elevator_type *e; - int err = 0; + int err; - if (q->nr_hw_queues != 1) - return 0; + if (!elv_support_iosched(q)) + return; + + WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)); - /* - * q->sysfs_lock must be held to provide mutual exclusion between - * elevator_switch() and here. - */ - mutex_lock(&q->sysfs_lock); if (unlikely(q->elevator)) - goto out_unlock; + return; - e = elevator_get(q, "mq-deadline", false); + if (!q->required_elevator_features) + e = elevator_get_default(q); + else + e = elevator_get_by_features(q); if (!e) - goto out_unlock; + return; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); err = blk_mq_init_sched(q, e); - if (err) + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + if (err) { + pr_warn("\"%s\" elevator initialization failed, " + "falling back to \"none\"\n", e->elevator_name); elevator_put(e); -out_unlock: - mutex_unlock(&q->sysfs_lock); - return err; + } } @@ -660,7 +733,7 @@ static int __elevator_change(struct request_queue *q, const char *name) struct elevator_type *e; /* Make sure queue is not in the middle of being removed */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return -ENOENT; /* @@ -677,7 +750,8 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { + if (q->elevator && + elevator_match(q->elevator->type, elevator_name, 0)) { elevator_put(e); return 0; } @@ -685,13 +759,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } -static inline bool elv_support_iosched(struct request_queue *q) -{ - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) - return false; - return true; -} - ssize_t elv_iosched_store(struct request_queue *q, const char *name, size_t count) { @@ -724,11 +791,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name, 0)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (elv_support_iosched(q)) + if (elv_support_iosched(q) && + elevator_match(__e, __e->elevator_name, + q->required_elevator_features)) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); @@ -763,3 +832,12 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +static int __init elevator_setup(char *str) +{ + pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" + "Please use sysfs to set IO scheduler for individual devices.\n"); + return 1; +} + +__setup("elevator=", elevator_setup); diff --git a/block/genhd.c b/block/genhd.c index 54f1f0d381f4..ff6268970ddc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -695,6 +695,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, dev_t devt; int retval; + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. + */ + if (register_queue) + elevator_init_mq(disk->queue); + /* minors == 0 indicates to use ext devt from part0 and should * be accompanied with EXT_DEVT flag. Make sure all * parameters make sense. @@ -1376,7 +1385,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " - "%lu %lu %lu %u\n", + "%lu %lu %lu %u " + "%lu %u" + "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[STAT_READ]), @@ -1393,7 +1404,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), + part_stat_read(hd, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) ); } disk_part_iter_exit(&piter); diff --git a/block/ioctl.c b/block/ioctl.c index 15a0eb80ada9..127194b9f9bd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> +#include <linux/compat.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/gfp.h> @@ -11,12 +12,12 @@ #include <linux/pr.h> #include <linux/uaccess.h> -static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) +static int blkpg_do_ioctl(struct block_device *bdev, + struct blkpg_partition __user *upart, int op) { struct block_device *bdevp; struct gendisk *disk; struct hd_struct *part, *lpart; - struct blkpg_ioctl_arg a; struct blkpg_partition p; struct disk_part_iter piter; long long start, length; @@ -24,9 +25,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg))) - return -EFAULT; - if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition))) + if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; disk = bdev->bd_disk; if (bdev != bdev->bd_contains) @@ -34,7 +33,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user partno = p.pno; if (partno <= 0) return -EINVAL; - switch (a.op) { + switch (op) { case BLKPG_ADD_PARTITION: start = p.start >> 9; length = p.length >> 9; @@ -155,48 +154,54 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -/* - * This is an exported API for the block driver, and will not - * acquire bd_mutex. This API should be used in case that - * caller has held bd_mutex already. - */ -int __blkdev_reread_part(struct block_device *bdev) +static int blkpg_ioctl(struct block_device *bdev, + struct blkpg_ioctl_arg __user *arg) { - struct gendisk *disk = bdev->bd_disk; + struct blkpg_partition __user *udata; + int op; - if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (get_user(op, &arg->op) || get_user(udata, &arg->data)) + return -EFAULT; + + return blkpg_do_ioctl(bdev, udata, op); +} - lockdep_assert_held(&bdev->bd_mutex); +#ifdef CONFIG_COMPAT +struct compat_blkpg_ioctl_arg { + compat_int_t op; + compat_int_t flags; + compat_int_t datalen; + compat_caddr_t data; +}; - return rescan_partitions(disk, bdev); +static int compat_blkpg_ioctl(struct block_device *bdev, + struct compat_blkpg_ioctl_arg __user *arg) +{ + compat_caddr_t udata; + int op; + + if (get_user(op, &arg->op) || get_user(udata, &arg->data)) + return -EFAULT; + + return blkpg_do_ioctl(bdev, compat_ptr(udata), op); } -EXPORT_SYMBOL(__blkdev_reread_part); +#endif -/* - * This is an exported API for the block driver, and will - * try to acquire bd_mutex. If bd_mutex has been held already - * in current context, please call __blkdev_reread_part(). - * - * Make sure the held locks in current context aren't required - * in open()/close() handler and I/O path for avoiding ABBA deadlock: - * - bd_mutex is held before calling block driver's open/close - * handler - * - reading partition table may submit I/O to the block device - */ -int blkdev_reread_part(struct block_device *bdev) +static int blkdev_reread_part(struct block_device *bdev) { - int res; + int ret; + + if (!disk_part_scan_enabled(bdev->bd_disk) || bdev != bdev->bd_contains) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; mutex_lock(&bdev->bd_mutex); - res = __blkdev_reread_part(bdev); + ret = bdev_disk_changed(bdev, false); mutex_unlock(&bdev->bd_mutex); - return res; + return ret; } -EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) @@ -265,36 +270,48 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, BLKDEV_ZERO_NOUNMAP); } -static int put_ushort(unsigned long arg, unsigned short val) +static int put_ushort(unsigned short __user *argp, unsigned short val) +{ + return put_user(val, argp); +} + +static int put_int(int __user *argp, int val) { - return put_user(val, (unsigned short __user *)arg); + return put_user(val, argp); } -static int put_int(unsigned long arg, int val) +static int put_uint(unsigned int __user *argp, unsigned int val) { - return put_user(val, (int __user *)arg); + return put_user(val, argp); } -static int put_uint(unsigned long arg, unsigned int val) +static int put_long(long __user *argp, long val) { - return put_user(val, (unsigned int __user *)arg); + return put_user(val, argp); } -static int put_long(unsigned long arg, long val) +static int put_ulong(unsigned long __user *argp, unsigned long val) { - return put_user(val, (long __user *)arg); + return put_user(val, argp); } -static int put_ulong(unsigned long arg, unsigned long val) +static int put_u64(u64 __user *argp, u64 val) { - return put_user(val, (unsigned long __user *)arg); + return put_user(val, argp); } -static int put_u64(unsigned long arg, u64 val) +#ifdef CONFIG_COMPAT +static int compat_put_long(compat_long_t *argp, long val) { - return put_user(val, (u64 __user *)arg); + return put_user(val, argp); } +static int compat_put_ulong(compat_ulong_t *argp, compat_ulong_t val) +{ + return put_user(val, argp); +} +#endif + int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -312,6 +329,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); +#ifdef CONFIG_COMPAT +/* + * This is the equivalent of compat_ptr_ioctl(), to be used by block + * drivers that implement only commands that are completely compatible + * between 32-bit and 64-bit user space + */ +int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + struct gendisk *disk = bdev->bd_disk; + + if (disk->fops->ioctl) + return disk->fops->ioctl(bdev, mode, cmd, + (unsigned long)compat_ptr(arg)); + + return -ENOIOCTLCMD; +} +EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); +#endif + static int blkdev_pr_register(struct block_device *bdev, struct pr_registration __user *arg) { @@ -482,6 +519,45 @@ static int blkdev_getgeo(struct block_device *bdev, return 0; } +#ifdef CONFIG_COMPAT +struct compat_hd_geometry { + unsigned char heads; + unsigned char sectors; + unsigned short cylinders; + u32 start; +}; + +static int compat_hdio_getgeo(struct block_device *bdev, + struct compat_hd_geometry __user *ugeo) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; + + if (!ugeo) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + memset(&geo, 0, sizeof(geo)); + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + + ret = copy_to_user(ugeo, &geo, 4); + ret |= put_user(geo.start, &ugeo->start); + if (ret) + ret = -EFAULT; + + return ret; +} +#endif + /* set the logical block size */ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, int __user *argp) @@ -508,13 +584,13 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, } /* - * always keep this in sync with compat_blkdev_ioctl() + * Common commands that are handled the same way on native and compat + * user space. Note the separate arg/argp parameters that are needed + * to deal with the compat_ptr() conversion. */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg, void __user *argp) { - void __user *argp = (void __user *)arg; - loff_t size; unsigned int max_sectors; switch (cmd) { @@ -532,62 +608,44 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: - return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: + return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: - return put_uint(arg, bdev_zone_sectors(bdev)); + return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: - return put_uint(arg, blkdev_nr_zones(bdev)); - case HDIO_GETGEO: - return blkdev_getgeo(bdev, argp); - case BLKRAGET: - case BLKFRAGET: - if (!arg) - return -EINVAL; - return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_uint(argp, blkdev_nr_zones(bdev->bd_disk)); case BLKROGET: - return put_int(arg, bdev_read_only(bdev) != 0); - case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ - return put_int(arg, block_size(bdev)); + return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ - return put_int(arg, bdev_logical_block_size(bdev)); + return put_int(argp, bdev_logical_block_size(bdev)); case BLKPBSZGET: /* get block device physical block size */ - return put_uint(arg, bdev_physical_block_size(bdev)); + return put_uint(argp, bdev_physical_block_size(bdev)); case BLKIOMIN: - return put_uint(arg, bdev_io_min(bdev)); + return put_uint(argp, bdev_io_min(bdev)); case BLKIOOPT: - return put_uint(arg, bdev_io_opt(bdev)); + return put_uint(argp, bdev_io_opt(bdev)); case BLKALIGNOFF: - return put_int(arg, bdev_alignment_offset(bdev)); + return put_int(argp, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: - return put_uint(arg, 0); + return put_uint(argp, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); - return put_ushort(arg, max_sectors); + return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); + return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev))); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; - case BLKBSZSET: - return blkdev_bszset(bdev, mode, argp); - case BLKPG: - return blkpg_ioctl(bdev, argp); case BLKRRPART: return blkdev_reread_part(bdev); - case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) - return -EFBIG; - return put_ulong(arg, size >> 9); - case BLKGETSIZE64: - return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: - case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: @@ -603,7 +661,132 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, argp); default: + return -ENOIOCTLCMD; + } +} + +/* + * Always keep this in sync with compat_blkdev_ioctl() + * to handle all incompatible commands in both functions. + * + * New commands must be compatible and go into blkdev_common_ioctl + */ +int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, + unsigned long arg) +{ + int ret; + loff_t size; + void __user *argp = (void __user *)arg; + + switch (cmd) { + /* These need separate implementations for the data structure */ + case HDIO_GETGEO: + return blkdev_getgeo(bdev, argp); + case BLKPG: + return blkpg_ioctl(bdev, argp); + + /* Compat mode returns 32-bit data instead of 'long' */ + case BLKRAGET: + case BLKFRAGET: + if (!argp) + return -EINVAL; + return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + case BLKGETSIZE: + size = i_size_read(bdev->bd_inode); + if ((size >> 9) > ~0UL) + return -EFBIG; + return put_ulong(argp, size >> 9); + + /* The data is compatible, but the command number is different */ + case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ + return put_int(argp, block_size(bdev)); + case BLKBSZSET: + return blkdev_bszset(bdev, mode, argp); + case BLKGETSIZE64: + return put_u64(argp, i_size_read(bdev->bd_inode)); + + /* Incompatible alignment on i386 */ + case BLKTRACESETUP: + return blk_trace_ioctl(bdev, cmd, argp); + default: + break; + } + + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + if (ret == -ENOIOCTLCMD) return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ + +#ifdef CONFIG_COMPAT + +#define BLKBSZGET_32 _IOR(0x12, 112, int) +#define BLKBSZSET_32 _IOW(0x12, 113, int) +#define BLKGETSIZE64_32 _IOR(0x12, 114, int) + +/* Most of the generic ioctls are handled in the normal fallback path. + This assumes the blkdev's low level compat_ioctl always returns + ENOIOCTLCMD for unknown ioctls. */ +long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + int ret; + void __user *argp = compat_ptr(arg); + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; + fmode_t mode = file->f_mode; + loff_t size; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + switch (cmd) { + /* These need separate implementations for the data structure */ + case HDIO_GETGEO: + return compat_hdio_getgeo(bdev, argp); + case BLKPG: + return compat_blkpg_ioctl(bdev, argp); + + /* Compat mode returns 32-bit data instead of 'long' */ + case BLKRAGET: + case BLKFRAGET: + if (!argp) + return -EINVAL; + return compat_put_long(argp, + (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + case BLKGETSIZE: + size = i_size_read(bdev->bd_inode); + if ((size >> 9) > ~0UL) + return -EFBIG; + return compat_put_ulong(argp, size >> 9); + + /* The data is compatible, but the command number is different */ + case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ + return put_int(argp, bdev_logical_block_size(bdev)); + case BLKBSZSET_32: + return blkdev_bszset(bdev, mode, argp); + case BLKGETSIZE64_32: + return put_u64(argp, i_size_read(bdev->bd_inode)); + + /* Incompatible alignment on i386 */ + case BLKTRACESETUP32: + return blk_trace_ioctl(bdev, cmd, argp); + default: + break; } + + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) + ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); + + return ret; } -EXPORT_SYMBOL_GPL(blkdev_ioctl); +#endif diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2a2a2e82832e..b490f47fd553 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -377,13 +377,6 @@ done: * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. - * - * For a zoned block device, __dd_dispatch_request() may return NULL - * if all the queued write requests are directed at zones that are already - * locked due to on-going write requests. In this case, make sure to mark - * the queue as needing a restart to ensure that the queue is run again - * and the pending writes dispatched once the target zones for the ongoing - * write requests are unlocked in dd_finish_request(). */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); - if (!rq && blk_queue_is_zoned(hctx->queue) && - !list_empty(&dd->fifo_list[WRITE])) - blk_mq_sched_mark_restart_hctx(hctx); spin_unlock(&dd->lock); return rq; @@ -561,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio) * spinlock so that the zone is never unlocked while deadline_fifo_request() * or deadline_next_request() are executing. This function is called for * all requests, whether or not these requests complete successfully. + * + * For a zoned block device, __dd_dispatch_request() may have stopped + * dispatching requests if all the queued requests are write requests directed + * at zones that are already locked due to on-going write requests. To ensure + * write request dispatch progress in this case, mark the queue as needing a + * restart to ensure that the queue is run again after completion of the + * request and zones being unlocked. */ static void dd_finish_request(struct request *rq) { @@ -572,6 +569,8 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); + if (!list_empty(&dd->fifo_list[WRITE])) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } } @@ -795,6 +794,7 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/opal_proto.h b/block/opal_proto.h index 466ec7be16ef..325cbba2465f 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -76,7 +76,6 @@ enum opal_response_token { * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Section: 6.3 Assigned UIDs */ -#define OPAL_UID_LENGTH 8 #define OPAL_METHOD_LENGTH 8 #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 @@ -108,6 +107,7 @@ enum opal_uid { OPAL_C_PIN_TABLE, OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + OPAL_DATASTORE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, @@ -119,8 +119,6 @@ enum opal_uid { OPAL_UID_HEXFF, }; -#define OPAL_METHOD_LENGTH 8 - /* Enum for indexing the OPALMETHOD array */ enum opal_method { OPAL_PROPERTIES, @@ -167,7 +165,6 @@ enum opal_token { OPAL_TABLE_LASTID = 0x0A, OPAL_TABLE_MIN = 0x0B, OPAL_TABLE_MAX = 0x0C, - /* authority table */ OPAL_PIN = 0x03, /* locking tokens */ @@ -182,7 +179,7 @@ enum opal_token { OPAL_LIFECYCLE = 0x06, /* locking info table */ OPAL_MAXRANGES = 0x04, - /* mbr control */ + /* mbr control */ OPAL_MBRENABLE = 0x01, OPAL_MBRDONE = 0x02, /* properties */ @@ -208,6 +205,10 @@ enum opal_lockingstate { OPAL_LOCKING_LOCKED = 0x03, }; +enum opal_parameter { + OPAL_SUM_SET_LIST = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/partition-generic.c b/block/partition-generic.c index aee643ce13d1..564fae77711d 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -127,7 +127,8 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " - "%8lu %8lu %8llu %8u" + "%8lu %8lu %8llu %8u " + "%8lu %8u" "\n", part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), @@ -143,7 +144,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, @@ -318,6 +321,24 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, const char *dname; int err; + /* + * Partitions are not supported on zoned block devices that are used as + * such. + */ + switch (disk->queue->limits.zoned) { + case BLK_ZONED_HM: + pr_warn("%s: partitions not supported on host managed zoned block device\n", + disk->disk_name); + return ERR_PTR(-ENXIO); + case BLK_ZONED_HA: + pr_info("%s: disabling host aware zoned block device support due to partitions\n", + disk->disk_name); + disk->queue->limits.zoned = BLK_ZONED_NONE; + break; + case BLK_ZONED_NONE: + break; + } + err = disk_expand_part_tbl(disk, partno); if (err) return ERR_PTR(err); @@ -439,12 +460,14 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -static int drop_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) { struct disk_part_iter piter; struct hd_struct *part; int res; + if (!disk_part_scan_enabled(disk)) + return 0; if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); @@ -459,204 +482,124 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -static bool part_zone_aligned(struct gendisk *disk, - struct block_device *bdev, - sector_t from, sector_t size) +static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, + struct parsed_partitions *state, int p) { - unsigned int zone_sectors = bdev_zone_sectors(bdev); + sector_t size = state->parts[p].size; + sector_t from = state->parts[p].from; + struct hd_struct *part; - /* - * If this function is called, then the disk is a zoned block device - * (host-aware or host-managed). This can be detected even if the - * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not - * set). In this case, however, only host-aware devices will be seen - * as a block device is not created for host-managed devices. Without - * zoned block device support, host-aware drives can still be used as - * regular block devices (no zone operation) and their zone size will - * be reported as 0. Allow this case. - */ - if (!zone_sectors) + if (!size) return true; - /* - * Check partition start and size alignement. If the drive has a - * smaller last runt zone, ignore it and allow the partition to - * use it. Check the zone size too: it should be a power of 2 number - * of sectors. - */ - if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) { - u32 rem; - - div_u64_rem(from, zone_sectors, &rem); - if (rem) + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) return false; - if ((from + size) < get_capacity(disk)) { - div_u64_rem(size, zone_sectors, &rem); - if (rem) - return false; - } + return true; + } - } else { + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); - if (from & (zone_sectors - 1)) - return false; - if ((from + size) < get_capacity(disk) && - (size & (zone_sectors - 1))) + if (disk_unlock_native_capacity(disk)) return false; + /* + * We can not ignore partitions of broken tables created by for + * example camera firmware, but we limit them to the end of the + * disk to avoid creating invalid block devices. + */ + size = get_capacity(disk) - from; } + part = add_partition(disk, p, from, size, state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + return true; + } + +#ifdef CONFIG_BLK_DEV_MD + if (state->parts[p].flags & ADDPART_FLAG_RAID) + md_autodetect_dev(part_to_dev(part)->devt); +#endif return true; } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; - struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - free_partitions(state); - state = NULL; - } + struct parsed_partitions *state; + int ret = -EAGAIN, p, highest; - res = drop_partitions(disk, bdev); - if (res) - return res; + if (!disk_part_scan_enabled(disk)) + return 0; - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); - check_disk_size_change(disk, bdev, true); - bdev->bd_invalidated = 0; - if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) + state = check_partition(disk, bdev); + if (!state) return 0; if (IS_ERR(state)) { /* - * I/O error reading the partition table. If any - * partition code tried to read beyond EOD, retry - * after unlocking native capacity. + * I/O error reading the partition table. If we tried to read + * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) - goto rescan; + return -EAGAIN; } return -EIO; } + /* - * If any partition code tried to read beyond EOD, try - * unlocking native capacity even if partition table is - * successfully read as we could be missing some partitions. + * Partitions are not supported on host managed zoned block devices. + */ + if (disk->queue->limits.zoned == BLK_ZONED_HM) { + pr_warn("%s: ignoring partition table on host managed zoned block device\n", + disk->disk_name); + ret = 0; + goto out_free_state; + } + + /* + * If we read beyond EOD, try unlocking native capacity even if the + * partition table was successfully read as we could be missing some + * partitions. */ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) - goto rescan; + goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* Detect the highest partition number and preallocate - * disk->part_tbl. This is an optimization and not strictly - * necessary. + /* + * Detect the highest partition number and preallocate disk->part_tbl. + * This is an optimization and not strictly necessary. */ for (p = 1, highest = 0; p < state->limit; p++) if (state->parts[p].size) highest = p; - disk_expand_part_tbl(disk, highest); - /* add partitions */ - for (p = 1; p < state->limit; p++) { - sector_t size, from; - - size = state->parts[p].size; - if (!size) - continue; - - from = state->parts[p].from; - if (from >= get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d start %llu is beyond EOD, ", - disk->disk_name, p, (unsigned long long) from); - if (disk_unlock_native_capacity(disk)) - goto rescan; - continue; - } + for (p = 1; p < state->limit; p++) + if (!blk_add_partition(disk, bdev, state, p)) + goto out_free_state; - if (from + size > get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d size %llu extends beyond EOD, ", - disk->disk_name, p, (unsigned long long) size); - - if (disk_unlock_native_capacity(disk)) { - /* free state and restart */ - goto rescan; - } else { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but - * we limit them to the end of the disk to avoid - * creating invalid block devices - */ - size = get_capacity(disk) - from; - } - } - - /* - * On a zoned block device, partitions should be aligned on the - * device zone size (i.e. zone boundary crossing not allowed). - * Otherwise, resetting the write pointer of the last zone of - * one partition may impact the following partition. - */ - if (bdev_is_zoned(bdev) && - !part_zone_aligned(disk, bdev, from, size)) { - printk(KERN_WARNING - "%s: p%d start %llu+%llu is not zone aligned\n", - disk->disk_name, p, (unsigned long long) from, - (unsigned long long) size); - continue; - } - - part = add_partition(disk, p, from, size, - state->parts[p].flags, - &state->parts[p].info); - if (IS_ERR(part)) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); - continue; - } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) - md_autodetect_dev(part_to_dev(part)->devt); -#endif - } + ret = 0; +out_free_state: free_partitions(state); - return 0; -} - -int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) -{ - int res; - - if (!bdev->bd_invalidated) - return 0; - - res = drop_partitions(disk, bdev); - if (res) - return res; - - set_capacity(disk, 0); - check_disk_size_change(disk, bdev, false); - bdev->bd_invalidated = 0; - /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - - return 0; + return ret; } unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index fe5d970e2e60..a2d97ee1908c 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -1233,7 +1233,7 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); if (size < 2 * VBLK_SIZE_HEAD) { - ldm_error("Value of size is to small."); + ldm_error("Value of size is too small."); return false; } diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index f5e0ad65e86a..b4e73d5dd5c2 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2001 Jens Axboe <axboe@suse.de> */ +#include <linux/compat.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -19,6 +20,7 @@ #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_cmnd.h> +#include <scsi/sg.h> struct blk_cmd_filter { unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; @@ -327,7 +329,14 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, struct iov_iter i; struct iovec *iov = NULL; - ret = import_iovec(rq_data_dir(rq), +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_import_iovec(rq_data_dir(rq), + hdr->dxferp, hdr->iovec_count, + 0, &iov, &i); + else +#endif + ret = import_iovec(rq_data_dir(rq), hdr->dxferp, hdr->iovec_count, 0, &iov, &i); if (ret < 0) @@ -542,6 +551,224 @@ static inline int blk_send_start_stop(struct request_queue *q, return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); } +int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_sg_io_hdr hdr32 = { + .interface_id = hdr->interface_id, + .dxfer_direction = hdr->dxfer_direction, + .cmd_len = hdr->cmd_len, + .mx_sb_len = hdr->mx_sb_len, + .iovec_count = hdr->iovec_count, + .dxfer_len = hdr->dxfer_len, + .dxferp = (uintptr_t)hdr->dxferp, + .cmdp = (uintptr_t)hdr->cmdp, + .sbp = (uintptr_t)hdr->sbp, + .timeout = hdr->timeout, + .flags = hdr->flags, + .pack_id = hdr->pack_id, + .usr_ptr = (uintptr_t)hdr->usr_ptr, + .status = hdr->status, + .masked_status = hdr->masked_status, + .msg_status = hdr->msg_status, + .sb_len_wr = hdr->sb_len_wr, + .host_status = hdr->host_status, + .driver_status = hdr->driver_status, + .resid = hdr->resid, + .duration = hdr->duration, + .info = hdr->info, + }; + + if (copy_to_user(argp, &hdr32, sizeof(hdr32))) + return -EFAULT; + + return 0; + } +#endif + + if (copy_to_user(argp, hdr, sizeof(*hdr))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(put_sg_io_hdr); + +int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp) +{ +#ifdef CONFIG_COMPAT + struct compat_sg_io_hdr hdr32; + + if (in_compat_syscall()) { + if (copy_from_user(&hdr32, argp, sizeof(hdr32))) + return -EFAULT; + + *hdr = (struct sg_io_hdr) { + .interface_id = hdr32.interface_id, + .dxfer_direction = hdr32.dxfer_direction, + .cmd_len = hdr32.cmd_len, + .mx_sb_len = hdr32.mx_sb_len, + .iovec_count = hdr32.iovec_count, + .dxfer_len = hdr32.dxfer_len, + .dxferp = compat_ptr(hdr32.dxferp), + .cmdp = compat_ptr(hdr32.cmdp), + .sbp = compat_ptr(hdr32.sbp), + .timeout = hdr32.timeout, + .flags = hdr32.flags, + .pack_id = hdr32.pack_id, + .usr_ptr = compat_ptr(hdr32.usr_ptr), + .status = hdr32.status, + .masked_status = hdr32.masked_status, + .msg_status = hdr32.msg_status, + .sb_len_wr = hdr32.sb_len_wr, + .host_status = hdr32.host_status, + .driver_status = hdr32.driver_status, + .resid = hdr32.resid, + .duration = hdr32.duration, + .info = hdr32.info, + }; + + return 0; + } +#endif + + if (copy_from_user(hdr, argp, sizeof(*hdr))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(get_sg_io_hdr); + +#ifdef CONFIG_COMPAT +struct compat_cdrom_generic_command { + unsigned char cmd[CDROM_PACKET_SIZE]; + compat_caddr_t buffer; + compat_uint_t buflen; + compat_int_t stat; + compat_caddr_t sense; + unsigned char data_direction; + compat_int_t quiet; + compat_int_t timeout; + compat_caddr_t reserved[1]; +}; +#endif + +static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc, + const void __user *arg) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_cdrom_generic_command cgc32; + + if (copy_from_user(&cgc32, arg, sizeof(cgc32))) + return -EFAULT; + + *cgc = (struct cdrom_generic_command) { + .buffer = compat_ptr(cgc32.buffer), + .buflen = cgc32.buflen, + .stat = cgc32.stat, + .sense = compat_ptr(cgc32.sense), + .data_direction = cgc32.data_direction, + .quiet = cgc32.quiet, + .timeout = cgc32.timeout, + .reserved[0] = compat_ptr(cgc32.reserved[0]), + }; + memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE); + return 0; + } +#endif + if (copy_from_user(cgc, arg, sizeof(*cgc))) + return -EFAULT; + + return 0; +} + +static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, + void __user *arg) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_cdrom_generic_command cgc32 = { + .buffer = (uintptr_t)(cgc->buffer), + .buflen = cgc->buflen, + .stat = cgc->stat, + .sense = (uintptr_t)(cgc->sense), + .data_direction = cgc->data_direction, + .quiet = cgc->quiet, + .timeout = cgc->timeout, + .reserved[0] = (uintptr_t)(cgc->reserved[0]), + }; + memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE); + + if (copy_to_user(arg, &cgc32, sizeof(cgc32))) + return -EFAULT; + + return 0; + } +#endif + if (copy_to_user(arg, cgc, sizeof(*cgc))) + return -EFAULT; + + return 0; +} + +static int scsi_cdrom_send_packet(struct request_queue *q, + struct gendisk *bd_disk, + fmode_t mode, void __user *arg) +{ + struct cdrom_generic_command cgc; + struct sg_io_hdr hdr; + int err; + + err = scsi_get_cdrom_generic_arg(&cgc, arg); + if (err) + return err; + + cgc.timeout = clock_t_to_jiffies(cgc.timeout); + memset(&hdr, 0, sizeof(hdr)); + hdr.interface_id = 'S'; + hdr.cmd_len = sizeof(cgc.cmd); + hdr.dxfer_len = cgc.buflen; + switch (cgc.data_direction) { + case CGC_DATA_UNKNOWN: + hdr.dxfer_direction = SG_DXFER_UNKNOWN; + break; + case CGC_DATA_WRITE: + hdr.dxfer_direction = SG_DXFER_TO_DEV; + break; + case CGC_DATA_READ: + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + break; + case CGC_DATA_NONE: + hdr.dxfer_direction = SG_DXFER_NONE; + break; + default: + return -EINVAL; + } + + hdr.dxferp = cgc.buffer; + hdr.sbp = cgc.sense; + if (hdr.sbp) + hdr.mx_sb_len = sizeof(struct request_sense); + hdr.timeout = jiffies_to_msecs(cgc.timeout); + hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; + hdr.cmd_len = sizeof(cgc.cmd); + + err = sg_io(q, bd_disk, &hdr, mode); + if (err == -EFAULT) + return -EFAULT; + + if (hdr.status) + return -EIO; + + cgc.stat = err; + cgc.buflen = hdr.resid; + if (scsi_put_cdrom_generic_arg(&cgc, arg)) + return -EFAULT; + + return err; +} + int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, unsigned int cmd, void __user *arg) { @@ -581,71 +808,20 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod case SG_IO: { struct sg_io_hdr hdr; - err = -EFAULT; - if (copy_from_user(&hdr, arg, sizeof(hdr))) - break; - err = sg_io(q, bd_disk, &hdr, mode); - if (err == -EFAULT) - break; - - if (copy_to_user(arg, &hdr, sizeof(hdr))) - err = -EFAULT; - break; - } - case CDROM_SEND_PACKET: { - struct cdrom_generic_command cgc; - struct sg_io_hdr hdr; - - err = -EFAULT; - if (copy_from_user(&cgc, arg, sizeof(cgc))) - break; - cgc.timeout = clock_t_to_jiffies(cgc.timeout); - memset(&hdr, 0, sizeof(hdr)); - hdr.interface_id = 'S'; - hdr.cmd_len = sizeof(cgc.cmd); - hdr.dxfer_len = cgc.buflen; - err = 0; - switch (cgc.data_direction) { - case CGC_DATA_UNKNOWN: - hdr.dxfer_direction = SG_DXFER_UNKNOWN; - break; - case CGC_DATA_WRITE: - hdr.dxfer_direction = SG_DXFER_TO_DEV; - break; - case CGC_DATA_READ: - hdr.dxfer_direction = SG_DXFER_FROM_DEV; - break; - case CGC_DATA_NONE: - hdr.dxfer_direction = SG_DXFER_NONE; - break; - default: - err = -EINVAL; - } + err = get_sg_io_hdr(&hdr, arg); if (err) break; - - hdr.dxferp = cgc.buffer; - hdr.sbp = cgc.sense; - if (hdr.sbp) - hdr.mx_sb_len = sizeof(struct request_sense); - hdr.timeout = jiffies_to_msecs(cgc.timeout); - hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; - hdr.cmd_len = sizeof(cgc.cmd); - err = sg_io(q, bd_disk, &hdr, mode); if (err == -EFAULT) break; - if (hdr.status) - err = -EIO; - - cgc.stat = err; - cgc.buflen = hdr.resid; - if (copy_to_user(arg, &cgc, sizeof(cgc))) + if (put_sg_io_hdr(&hdr, arg)) err = -EFAULT; - break; } + case CDROM_SEND_PACKET: + err = scsi_cdrom_send_packet(q, bd_disk, mode, arg); + break; /* * old junk scsi send command ioctl diff --git a/block/sed-opal.c b/block/sed-opal.c index 7e1a444a25b2..880cc57a5f6b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -129,8 +129,7 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, /* tables */ - - [OPAL_TABLE_TABLE] + [OPAL_TABLE_TABLE] = { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, @@ -150,9 +149,10 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_DATASTORE] = + { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ - [OPAL_C_PIN_MSID] = { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, [OPAL_C_PIN_SID] = @@ -161,7 +161,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, /* half UID's (only first 4 bytes used) */ - [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, [OPAL_HALF_UID_BOOLEAN_ACE] = @@ -375,8 +374,8 @@ static void check_geometry(struct opal_dev *dev, const void *data) { const struct d0_geometry_features *geo = data; - dev->align = geo->alignment_granularity; - dev->lowest_lba = geo->lowest_aligned_lba; + dev->align = be64_to_cpu(geo->alignment_granularity); + dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba); } static int execute_step(struct opal_dev *dev, @@ -517,6 +516,7 @@ static int opal_discovery0(struct opal_dev *dev, void *data) ret = opal_recv_cmd(dev); if (ret) return ret; + return opal_discovery0_end(dev); } @@ -525,6 +525,7 @@ static int opal_discovery0_step(struct opal_dev *dev) const struct opal_step discovery0_step = { opal_discovery0, }; + return execute_step(dev, &discovery0_step, 0); } @@ -551,6 +552,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) { if (!can_add(err, cmd, 1)) return; + cmd->cmd[cmd->pos++] = tok; } @@ -577,6 +579,7 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring, header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0; header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0; header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; cmd->cmd[cmd->pos++] = len; } @@ -649,6 +652,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr) if (lr == 0) return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; buffer[7] = lr; @@ -903,10 +907,6 @@ static int response_parse(const u8 *buf, size_t length, num_entries++; } - if (num_entries == 0) { - pr_debug("Couldn't parse response.\n"); - return -EINVAL; - } resp->num = num_entries; return 0; @@ -945,6 +945,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, } *store = tok->pos + skip; + return tok->len - skip; } @@ -1062,6 +1063,7 @@ static int start_opal_session_cont(struct opal_dev *dev) dev->hsn = hsn; dev->tsn = tsn; + return 0; } @@ -1084,6 +1086,7 @@ static int end_session_cont(struct opal_dev *dev) { dev->hsn = 0; dev->tsn = 0; + return parse_and_check_status(dev); } @@ -1138,11 +1141,11 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, * * the result is provided in dev->resp->tok[4] */ -static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, +static int generic_get_table_info(struct opal_dev *dev, const u8 *table_uid, u64 column) { u8 uid[OPAL_UID_LENGTH]; - const unsigned int half = OPAL_UID_LENGTH/2; + const unsigned int half = OPAL_UID_LENGTH_HALF; /* sed-opal UIDs can be split in two halves: * first: actual table index @@ -1151,7 +1154,7 @@ static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, * first part of the target table as relative index into that table */ memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); - memcpy(uid+half, opaluid[table], half); + memcpy(uid + half, table_uid, half); return generic_get_column(dev, uid, column); } @@ -1172,6 +1175,7 @@ static int gen_key(struct opal_dev *dev, void *data) return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1184,12 +1188,14 @@ static int get_active_key_cont(struct opal_dev *dev) error = parse_and_check_status(dev); if (error) return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); if (!activekey) { pr_debug("%s: Couldn't extract the Activekey from the response\n", __func__); return OPAL_INVAL_PARAM; } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); if (!dev->prev_data) @@ -1217,6 +1223,75 @@ static int get_active_key(struct opal_dev *dev, void *data) return get_active_key_cont(dev); } +static int generic_table_write_data(struct opal_dev *dev, const u64 data, + u64 offset, u64 size, const u8 *uid) +{ + const u8 __user *src = (u8 __user *)(uintptr_t)data; + u8 *dst; + u64 len; + size_t off = 0; + int err; + + /* do we fit in the available space? */ + err = generic_get_table_info(dev, uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (size > len || offset > len - size) { + pr_debug("Does not fit in the table (%llu vs. %llu)\n", + offset + size, len); + return -ENOSPC; + } + + /* do the actual transmission(s) */ + while (off < size) { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(size - off)); + pr_debug("Write bytes %zu+%llu/%llu\n", off, len, size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + + if (copy_from_user(dst, src + off, len)) { + err = -EFAULT; + break; + } + + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + + return err; +} + static int generic_lr_enable_disable(struct opal_dev *dev, u8 *uid, bool rle, bool wle, bool rl, bool wl) @@ -1251,6 +1326,7 @@ static int generic_lr_enable_disable(struct opal_dev *dev, add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); + return err; } @@ -1263,6 +1339,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, 0, 0); if (err) pr_debug("Failed to create enable global lr command\n"); + return err; } @@ -1313,7 +1390,6 @@ static int setup_locking_range(struct opal_dev *dev, void *data) if (err) { pr_debug("Error building Setup Locking range command.\n"); return err; - } return finalize_and_send(dev, parse_and_check_status); @@ -1393,6 +1469,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) kfree(key); dev->prev_data = NULL; } + return ret; } @@ -1518,6 +1595,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data) pr_debug("Error building Erase Locking Range Command.\n"); return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1576,67 +1654,9 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) static int write_shadow_mbr(struct opal_dev *dev, void *data) { struct opal_shadow_mbr *shadow = data; - const u8 __user *src; - u8 *dst; - size_t off = 0; - u64 len; - int err = 0; - - /* do we fit in the available shadow mbr space? */ - err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); - if (err) { - pr_debug("MBR: could not get shadow size\n"); - return err; - } - - len = response_get_u64(&dev->parsed, 4); - if (shadow->size > len || shadow->offset > len - shadow->size) { - pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", - shadow->offset + shadow->size, len); - return -ENOSPC; - } - - /* do the actual transmission(s) */ - src = (u8 __user *)(uintptr_t)shadow->data; - while (off < shadow->size) { - err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WHERE); - add_token_u64(&err, dev, shadow->offset + off); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - - /* - * The bytestring header is either 1 or 2 bytes, so assume 2. - * There also needs to be enough space to accommodate the - * trailing OPAL_ENDNAME (1 byte) and tokens added by - * cmd_finalize. - */ - len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), - (size_t)(shadow->size - off)); - pr_debug("MBR: write bytes %zu+%llu/%llu\n", - off, len, shadow->size); - - dst = add_bytestring_header(&err, dev, len); - if (!dst) - break; - if (copy_from_user(dst, src + off, len)) - err = -EFAULT; - dev->pos += len; - add_token_u8(&err, dev, OPAL_ENDNAME); - if (err) - break; - - err = finalize_and_send(dev, parse_and_check_status); - if (err) - break; - - off += len; - } - return err; + return generic_table_write_data(dev, shadow->data, shadow->offset, + shadow->size, opaluid[OPAL_MBR]); } static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, @@ -1816,6 +1836,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) pr_debug("Error building SET command.\n"); return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1857,6 +1878,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data) pr_debug("Error building SET command.\n"); return ret; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1864,7 +1886,6 @@ static int activate_lsp(struct opal_dev *dev, void *data) { struct opal_lr_act *opal_act = data; u8 user_lr[OPAL_UID_LENGTH]; - u8 uint_3 = 0x83; int err, i; err = cmd_start(dev, opaluid[OPAL_LOCKINGSP_UID], @@ -1877,10 +1898,7 @@ static int activate_lsp(struct opal_dev *dev, void *data) return err; add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, uint_3); - add_token_u8(&err, dev, 6); - add_token_u8(&err, dev, 0); - add_token_u8(&err, dev, 0); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); add_token_u8(&err, dev, OPAL_STARTLIST); add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); @@ -1947,6 +1965,113 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data) return 0; } +static int write_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *write_tbl = data; + + return generic_table_write_data(dev, write_tbl->data, write_tbl->offset, + write_tbl->size, write_tbl->table_uid); +} + +static int read_table_data_cont(struct opal_dev *dev) +{ + int err; + const char *data_read; + + err = parse_and_check_status(dev); + if (err) + return err; + + dev->prev_d_len = response_get_string(&dev->parsed, 1, &data_read); + dev->prev_data = (void *)data_read; + if (!dev->prev_data) { + pr_debug("%s: Couldn't read data from the table.\n", __func__); + return OPAL_INVAL_PARAM; + } + + return 0; +} + +/* + * IO_BUFFER_LENGTH = 2048 + * sizeof(header) = 56 + * No. of Token Bytes in the Response = 11 + * MAX size of data that can be carried in response buffer + * at a time is : 2048 - (56 + 11) = 1981 = 0x7BD. + */ +#define OPAL_MAX_READ_TABLE (0x7BD) + +static int read_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *read_tbl = data; + int err; + size_t off = 0, max_read_size = OPAL_MAX_READ_TABLE; + u64 table_len, len; + u64 offset = read_tbl->offset, read_size = read_tbl->size - 1; + u8 __user *dst; + + err = generic_get_table_info(dev, read_tbl->table_uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + table_len = response_get_u64(&dev->parsed, 4); + + /* Check if the user is trying to read from the table limits */ + if (read_size > table_len || offset > table_len - read_size) { + pr_debug("Read size exceeds the Table size limits (%llu vs. %llu)\n", + offset + read_size, table_len); + return -EINVAL; + } + + while (off < read_size) { + err = cmd_start(dev, read_tbl->table_uid, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTROW); + add_token_u64(&err, dev, offset + off); /* start row value */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDROW); + + len = min(max_read_size, (size_t)(read_size - off)); + add_token_u64(&err, dev, offset + off + len); /* end row value + */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_debug("Error building read table data command.\n"); + break; + } + + err = finalize_and_send(dev, read_table_data_cont); + if (err) + break; + + /* len+1: This includes the NULL terminator at the end*/ + if (dev->prev_d_len > len + 1) { + err = -EOVERFLOW; + break; + } + + dst = (u8 __user *)(uintptr_t)read_tbl->data; + if (copy_to_user(dst + off, dev->prev_data, dev->prev_d_len)) { + pr_debug("Error copying data to userspace\n"); + err = -EFAULT; + break; + } + dev->prev_data = NULL; + + off += len; + } + + return err; +} + static int end_opal_session(struct opal_dev *dev, void *data) { int err = 0; @@ -1957,6 +2082,7 @@ static int end_opal_session(struct opal_dev *dev, void *data) if (err < 0) return err; + return finalize_and_send(dev, end_session_cont); } @@ -1965,6 +2091,7 @@ static int end_opal_session_error(struct opal_dev *dev) const struct opal_step error_end_session = { end_opal_session, }; + return execute_step(dev, &error_end_session, 0); } @@ -1984,6 +2111,7 @@ static int check_opal_support(struct opal_dev *dev) ret = opal_discovery0_step(dev); dev->supported = !ret; mutex_unlock(&dev->dev_lock); + return ret; } @@ -2004,6 +2132,7 @@ void free_opal_dev(struct opal_dev *dev) { if (!dev) return; + clean_opal_dev(dev); kfree(dev); } @@ -2026,6 +2155,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) kfree(dev); return NULL; } + return dev; } EXPORT_SYMBOL(init_opal_dev); @@ -2045,6 +2175,7 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2062,6 +2193,7 @@ static int opal_erase_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2089,6 +2221,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2113,6 +2246,7 @@ static int opal_set_mbr_done(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2133,6 +2267,7 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2151,6 +2286,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) setup_opal_dev(dev); add_suspend_info(dev, suspend); mutex_unlock(&dev->dev_lock); + return 0; } @@ -2169,12 +2305,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev, pr_debug("Locking state was not RO or RW\n"); return -EINVAL; } + if (lk_unlk->session.who < OPAL_USER1 || lk_unlk->session.who > OPAL_USER9) { pr_debug("Authority was not within the range of users: %d\n", lk_unlk->session.who); return -EINVAL; } + if (lk_unlk->session.sum) { pr_debug("%s not supported in sum. Use setup locking range\n", __func__); @@ -2185,6 +2323,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2267,6 +2406,7 @@ static int opal_lock_unlock(struct opal_dev *dev, mutex_lock(&dev->dev_lock); ret = __opal_lock_unlock(dev, lk_unlk); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2289,6 +2429,7 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2310,6 +2451,7 @@ static int opal_activate_lsp(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2327,6 +2469,7 @@ static int opal_setup_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2347,6 +2490,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) setup_opal_dev(dev); ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2371,6 +2515,7 @@ static int opal_activate_user(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2382,6 +2527,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; + if (!dev->supported) return false; @@ -2399,6 +2545,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) suspend->unlk.session.sum); was_failure = true; } + if (dev->mbr_enabled) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) @@ -2406,10 +2553,73 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } } mutex_unlock(&dev->dev_lock); + return was_failure; } EXPORT_SYMBOL(opal_unlock_from_suspend); +static int opal_read_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step read_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { read_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, read_table_steps, + ARRAY_SIZE(read_table_steps)); +} + +static int opal_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step write_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { write_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, write_table_steps, + ARRAY_SIZE(write_table_steps)); +} + +static int opal_generic_read_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + int ret, bit_set; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + bit_set = fls64(rw_tbl->flags) - 1; + switch (bit_set) { + case OPAL_READ_TABLE: + ret = opal_read_table(dev, rw_tbl); + break; + case OPAL_WRITE_TABLE: + ret = opal_write_table(dev, rw_tbl); + break; + default: + pr_debug("Invalid bit set in the flag (%016llx).\n", + rw_tbl->flags); + ret = -EINVAL; + break; + } + + mutex_unlock(&dev->dev_lock); + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2472,6 +2682,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_PSID_REVERT_TPR: ret = opal_reverttper(dev, p, true); break; + case IOC_OPAL_GENERIC_TABLE_RW: + ret = opal_generic_read_write_table(dev, p); + break; default: break; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0094609dd6..d910534b3a41 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -7,6 +7,7 @@ #include <linux/t10-pi.h> #include <linux/blkdev.h> #include <linux/crc-t10dif.h> +#include <linux/module.h> #include <net/checksum.h> typedef __be16 (csum_fn) (void *, unsigned int); @@ -27,7 +28,7 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * tag. */ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -37,7 +38,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, pi->guard_tag = fn(iter->data_buf, iter->interval); pi->app_tag = 0; - if (type == 1) + if (type == T10_PI_TYPE1_PROTECTION) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; @@ -51,17 +52,18 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; + BUG_ON(type == T10_PI_TYPE0_PROTECTION); + for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf; __be16 csum; - switch (type) { - case 1: - case 2: + if (type == T10_PI_TYPE1_PROTECTION || + type == T10_PI_TYPE2_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -73,12 +75,10 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - break; - case 3: + } else if (type == T10_PI_TYPE3_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; - break; } csum = fn(iter->data_buf, iter->interval); @@ -102,94 +102,40 @@ next: static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 1); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 1); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 1); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 1); -} - -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, 3); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, 3); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, 3); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, 3); -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - /** - * t10_pi_prepare - prepare PI prior submitting request to device + * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to * partitioning, MD/DM cloning, etc. the actual physical start sector is * likely to be different. Remap protection information to match the * physical LBA. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_prepare(struct request *rq, u8 protection_type) +static void t10_pi_type1_prepare(struct request *rq) { const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -222,13 +168,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type) bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } -EXPORT_SYMBOL(t10_pi_prepare); /** - * t10_pi_complete - prepare PI prior returning request to the block layer + * t10_pi_type1_complete - prepare PI prior returning request to the blk layer * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) - * @intervals: total elements to prepare + * @nr_bytes: total bytes to prepare * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to @@ -236,19 +180,14 @@ EXPORT_SYMBOL(t10_pi_prepare); * likely to be different. Since the physical start sector was submitted * to the device, we should remap it back to virtual values expected by the * block layer. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -276,4 +215,71 @@ void t10_pi_complete(struct request *rq, u8 protection_type, } } } -EXPORT_SYMBOL(t10_pi_complete); + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/* Type 3 does not have a reference tag so no remapping is required. */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/* Type 3 does not have a reference tag so no remapping is required. */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); + +MODULE_LICENSE("GPL"); |