From 26f76dce60d28028e5c1fbbc39e771366a27671f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Fri, 5 Jan 2018 14:15:59 +0100 Subject: lightnvm: use internal pblk methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that rrpc has been removed, the only user of the ppa helpers is pblk. However, pblk already defines similar functions. Switch pblk to use the internal ones, and remove the generic ppa helpers. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2d1d9de06728..14e274b7d094 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -418,25 +418,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, return l; } -static inline int ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) -{ - if (ppa_empty(ppa1) || ppa_empty(ppa2)) - return 0; - - return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && - (ppa1.g.blk == ppa2.g.blk)); -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, -- cgit v1.2.3 From e3e13bcc14717800e3e3239ca3faac24f2f04575 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Fri, 5 Jan 2018 14:16:00 +0100 Subject: lightnvm: remove hybrid ocssd 1.2 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that rrpc has been removed, also remove the hybrid 1.2 support from the core.
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 141 ------------------------------------------- drivers/nvme/host/lightnvm.c | 96 ----------------------------- include/linux/lightnvm.h | 43 ------------- 3 files changed, 280 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 83249b43dd06..390d5efd6287 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -45,12 +45,6 @@ struct nvm_dev_map { int nr_chnls; }; -struct nvm_area { - struct list_head list; - sector_t begin; - sector_t end; /* end is excluded */ -}; - static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) { struct nvm_target *tgt; @@ -524,35 +518,6 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); } -void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, - int len) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_dev_map *dev_rmap = dev->rmap; - u64 i; - - for (i = 0; i < len; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - struct ppa_addr gaddr; - u64 pba = le64_to_cpu(entries[i]); - u64 diff; - - if (!pba) - continue; - - gaddr = linear_to_generic_addr(geo, pba); - ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; - lun_roffs = ch_rmap->lun_offs; - - diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + - (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; - - entries[i] -= cpu_to_le64(diff); - } -} -EXPORT_SYMBOL(nvm_part_to_tgt); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -726,112 +691,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io_sync); -int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct nvm_rq rqd; - int ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rqd.opcode = NVM_OP_ERASE; - rqd.flags = geo->plane_mode >> 1; - - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - if (ret) - return ret; - - ret = nvm_submit_io_sync(tgt_dev, &rqd); - if (ret) { - pr_err("rrpr: erase I/O submission failed: %d\n", ret); - goto free_ppa_list; - } - -free_ppa_list: - nvm_free_rqd_ppalist(tgt_dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_erase_sync); - -int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb, - nvm_l2p_update_fn *update_l2p, void *priv) -{ - struct nvm_dev *dev = tgt_dev->parent; - - if (!dev->ops->get_l2p_tbl) - return 0; - - return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv); -} -EXPORT_SYMBOL(nvm_get_l2p_tbl); - -int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &dev->geo; - struct nvm_area *area, *prev, *next; - sector_t begin = 0; - sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; - - if (len > max_sectors) - return -EINVAL; - - area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL); - if (!area) - return -ENOMEM; - - prev = NULL; - - spin_lock(&dev->lock); - list_for_each_entry(next, &dev->area_list, list) { - if (begin + len > next->begin) { - begin = next->end; - prev = next; - continue; - } - break; - } - - if ((begin + len) > max_sectors) { - spin_unlock(&dev->lock); - kfree(area); - return -EINVAL; - } - - area->begin = *lba = begin; - area->end = begin + len; - - if (prev) /* insert into sorted order */ - list_add(&area->list, &prev->list); - else - list_add(&area->list, 
&dev->area_list); - spin_unlock(&dev->lock); - - return 0; -} -EXPORT_SYMBOL(nvm_get_area); - -void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_area *area; - - spin_lock(&dev->lock); - list_for_each_entry(area, &dev->area_list, list) { - if (area->begin != begin) - continue; - - list_del(&area->list); - spin_unlock(&dev->lock); - kfree(area); - return; - } - spin_unlock(&dev->lock); -} -EXPORT_SYMBOL(nvm_put_area); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba3d7f3349e5..26f7eccc1684 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -31,27 +31,10 @@ enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_l2p_tbl = 0xea, nvme_nvm_admin_get_bb_tbl = 0xf2, nvme_nvm_admin_set_bb_tbl = 0xf1, }; -struct nvme_nvm_hb_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 slba; -}; - struct nvme_nvm_ph_rw { __u8 opcode; __u8 flags; @@ -80,19 +63,6 @@ struct nvme_nvm_identity { __u32 rsvd11[5]; }; -struct nvme_nvm_l2ptbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[4]; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le32 nlb; - __le16 cdw14[6]; -}; - struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; @@ -139,9 +109,7 @@ struct nvme_nvm_command { union { struct nvme_common_command common; struct nvme_nvm_identity identity; - struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_l2ptbl l2p; struct nvme_nvm_getbbtbl get_bb; struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; @@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl { static inline void _nvme_nvm_check_size(void) { BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); @@ -332,62 +298,6 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, - nvm_l2p_update_fn *update_l2p, void *priv) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; - u32 nlb_pr_rq = len / sizeof(u64); - u64 cmd_slba = slba; - void *entries; - int ret = 0; - - c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->head->ns_id); - entries = kmalloc(len, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - while (nlb) { - u32 cmd_nlb = min(nlb_pr_rq, nlb); - u64 elba = slba + cmd_nlb; - - c.l2p.slba = cpu_to_le64(cmd_slba); - c.l2p.nlb = cpu_to_le32(cmd_nlb); - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, - (struct nvme_command *)&c, entries, len); - if (ret) { - dev_err(ns->ctrl->device, - "L2P table transfer failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (unlikely(elba > nvmdev->total_secs)) { - pr_err("nvm: L2P data from device is out of bounds!\n"); - ret = -EINVAL; - goto out; - } - - /* Transform physical address to target address 
space */ - nvm_part_to_tgt(nvmdev, entries, cmd_nlb); - - if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { - ret = -EINTR; - goto out; - } - - cmd_slba += cmd_nlb; - nlb -= cmd_nlb; - } - -out: - kfree(entries); - return ret; -} - static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, u8 *blks) { @@ -474,10 +384,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); - - if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, blk_status_t status) @@ -597,8 +503,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr, static struct nvm_dev_ops nvme_nvm_dev_ops = { .identity = nvme_nvm_identity, - .get_l2p_tbl = nvme_nvm_get_l2p_tbl, - .get_bb_tbl = nvme_nvm_get_bb_tbl, .set_bb_tbl = nvme_nvm_set_bb_tbl, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 14e274b7d094..97ceb841e9a0 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,10 +50,7 @@ struct nvm_id; struct nvm_dev; struct nvm_tgt_dev; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); @@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); struct nvm_dev_ops { nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; nvm_op_set_bb_fn *set_bb_tbl; @@ -112,8 +108,6 @@ enum { NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ - NVM_OP_HBREAD = 0x02, - NVM_OP_HBWRITE = 0x81, NVM_OP_PWRITE = 0x91, NVM_OP_PREAD = 0x92, NVM_OP_ERASE = 0x90, @@ -346,36 +340,6 @@ struct nvm_dev { struct list_head targets; }; -static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, - u64 pba) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = pba; - - l.ppa = 0; - - div_u64_rem(ppa, geo->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, geo->sec_per_pg); - div_u64_rem(ppa, geo->pgs_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, geo->pgs_per_blk); - div_u64_rem(ppa, geo->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, geo->blks_per_lun); - div_u64_rem(ppa, geo->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, geo->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, struct ppa_addr r) { @@ -462,17 +426,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, - void *); -extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); -extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *); extern int 
nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); - #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 98281a90acc04d8a10407dabd2e397e4312b80c0 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 5 Jan 2018 14:16:01 +0100 Subject: lightnvm: remove unnecessary field from nvm_rq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the wait field in nvm_rq. It is not used anymore, as targets rely on the functionality provided by the LightNVM subsystem when sending sync I/O. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 97ceb841e9a0..07cdb05a9a87 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -233,7 +233,6 @@ struct nvm_rq { void *meta_list; dma_addr_t dma_meta_list; - struct completion *wait; nvm_end_io_fn *end_io; uint8_t opcode; -- cgit v1.2.3 From bb27aa9ecd1f72e68b0fa2dffeb45bee3b1cb5ca Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Fri, 5 Jan 2018 14:16:02 +0100 Subject: lightnvm: remove lower page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lower page table is unused. All page tables reported by 1.2 devices report a sequential 1:1 page mapping. This is also not used going forward with the 2.0 revision. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 67 -------------------------------------------- drivers/nvme/host/lightnvm.c | 14 --------- include/linux/lightnvm.h | 6 ---- 3 files changed, 87 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 390d5efd6287..52059dd0ed18 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -751,53 +751,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, } EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); -static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) -{ - struct nvm_geo *geo = &dev->geo; - int i; - - dev->lps_per_blk = geo->pgs_per_blk; - dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); - if (!dev->lptbl) - return -ENOMEM; - - /* Just a linear array */ - for (i = 0; i < dev->lps_per_blk; i++) - dev->lptbl[i] = i; - - return 0; -} - -static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) -{ - int i, p; - struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; - - if (!mlc->num_pairs) - return 0; - - dev->lps_per_blk = mlc->num_pairs; - dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); - if (!dev->lptbl) - return -ENOMEM; - - /* The lower page table encoding consists of a list of bytes, where each - * has a lower and an upper half.
The first half byte maintains the - * increment value and every value after is an offset added to the - * previous incrementation value - */ - dev->lptbl[0] = mlc->pairs[0] & 0xF; - for (i = 1; i < dev->lps_per_blk; i++) { - p = mlc->pairs[i >> 1]; - if (i & 0x1) /* upper */ - dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); - else /* lower */ - dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); - } - - return 0; -} - static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -846,25 +799,6 @@ static int nvm_core_init(struct nvm_dev *dev) if (!dev->lun_map) return -ENOMEM; - switch (grp->fmtype) { - case NVM_ID_FMTYPE_SLC: - if (nvm_init_slc_tbl(dev, grp)) { - ret = -ENOMEM; - goto err_fmtype; - } - break; - case NVM_ID_FMTYPE_MLC: - if (nvm_init_mlc_tbl(dev, grp)) { - ret = -ENOMEM; - goto err_fmtype; - } - break; - default: - pr_err("nvm: flash type not supported\n"); - ret = -EINVAL; - goto err_fmtype; - } - INIT_LIST_HEAD(&dev->area_list); INIT_LIST_HEAD(&dev->targets); mutex_init(&dev->mlock); @@ -890,7 +824,6 @@ static void nvm_free(struct nvm_dev *dev) dev->ops->destroy_dma_pool(dev->dma_pool); nvm_unregister_map(dev); - kfree(dev->lptbl); kfree(dev->lun_map); kfree(dev); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 26f7eccc1684..15bf243f6096 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -246,20 +246,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->cpar = le16_to_cpu(src->cpar); - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } - - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); - } - return 0; } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 07cdb05a9a87..a5d8e0cbbb46 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -174,8 +174,6 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - - struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -313,10 +311,6 @@ struct nvm_dev { /* Device information */ struct nvm_geo geo; - /* lower page table */ - int lps_per_blk; - int *lptbl; - unsigned long total_secs; unsigned long *lun_map; -- cgit v1.2.3 From fae7fae4077c24dc2be720b9f21f53adea98d7dd Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Fri, 5 Jan 2018 14:16:03 +0100 Subject: lightnvm: make geometry structures 2.0 ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for the 2.0 revision by adapting the geometry structures to coexist with the 1.2 revision. 
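For orientation while reading the renames in this patch, the 1.2-era geometry fields map to their generic replacements roughly as follows (derived from the hunks below; the parenthesized notes are editorial):

  geo->nr_luns       -> geo->all_luns     (LUNs across all channels)
  geo->luns_per_chnl -> geo->nr_luns      (LUNs per channel)
  geo->blks_per_lun  -> geo->nr_chks      (chunks per LUN)
  geo->sec_per_blk   -> geo->sec_per_chk  (sectors per chunk)
  geo->pgs_per_blk   -> geo->ws_per_chk   (min-write units per chunk)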
Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 91 +++++++++++++++++++--------------------- drivers/lightnvm/pblk-core.c | 6 +-- drivers/lightnvm/pblk-init.c | 62 ++++++++++++++------------- drivers/lightnvm/pblk-recovery.c | 2 +- drivers/lightnvm/pblk-sysfs.c | 6 +-- drivers/lightnvm/pblk.h | 8 ++-- drivers/nvme/host/lightnvm.c | 79 +++++++++++++++++++++------------- include/linux/lightnvm.h | 52 ++++++++++++++--------- 8 files changed, 170 insertions(+), 136 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 52059dd0ed18..6d6d2c12ff5b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -98,7 +98,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) if (clear) { for (j = 0; j < ch_map->nr_luns; j++) { int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.luns_per_chnl) + lun; + int lunid = (ch * dev->geo.nr_luns) + lun; WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); @@ -124,10 +124,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, struct ppa_addr *luns; int nr_luns = lun_end - lun_begin + 1; int luns_left = nr_luns; - int nr_chnls = nr_luns / dev->geo.luns_per_chnl; - int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; - int bch = lun_begin / dev->geo.luns_per_chnl; - int blun = lun_begin % dev->geo.luns_per_chnl; + int nr_chnls = nr_luns / dev->geo.nr_luns; + int nr_chnls_mod = nr_luns % dev->geo.nr_luns; + int bch = lun_begin / dev->geo.nr_luns; + int blun = lun_begin % dev->geo.nr_luns; int lunid = 0; int lun_balanced = 1; int prev_nr_luns; @@ -148,15 +148,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, if (!luns) goto err_luns; - prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; + prev_nr_luns = (luns_left > dev->geo.nr_luns) ? + dev->geo.nr_luns : luns_left; for (i = 0; i < nr_chnls; i++) { struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; int *lun_roffs = ch_rmap->lun_offs; struct nvm_ch_map *ch_map = &dev_map->chnls[i]; int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; + int luns_in_chnl = (luns_left > dev->geo.nr_luns) ? + dev->geo.nr_luns : luns_left; if (lun_balanced && prev_nr_luns != luns_in_chnl) lun_balanced = 0; @@ -193,8 +193,8 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); /* Target device only owns a portion of the physical device */ tgt_dev->geo.nr_chnls = nr_chnls; - tgt_dev->geo.nr_luns = nr_luns; - tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; + tgt_dev->geo.all_luns = nr_luns; + tgt_dev->geo.nr_luns = (lun_balanced) ? 
prev_nr_luns : -1; tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; tgt_dev->q = dev->q; tgt_dev->map = dev_map; @@ -414,7 +414,7 @@ static int nvm_register_map(struct nvm_dev *dev) for (i = 0; i < dev->geo.nr_chnls; i++) { struct nvm_ch_map *ch_rmap; int *lun_roffs; - int luns_in_chnl = dev->geo.luns_per_chnl; + int luns_in_chnl = dev->geo.nr_luns; ch_rmap = &rmap->chnls[i]; @@ -717,10 +717,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) struct nvm_geo *geo = &dev->geo; int blk, offset, pl, blktype; - if (nr_blks != geo->blks_per_lun * geo->plane_mode) + if (nr_blks != geo->nr_chks * geo->plane_mode) return -EINVAL; - for (blk = 0; blk < geo->blks_per_lun; blk++) { + for (blk = 0; blk < geo->nr_chks; blk++) { offset = blk * geo->plane_mode; blktype = blks[offset]; @@ -736,7 +736,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) blks[blk] = blktype; } - return geo->blks_per_lun; + return geo->nr_chks; } EXPORT_SYMBOL(nvm_bb_tbl_fold); @@ -758,43 +758,40 @@ static int nvm_core_init(struct nvm_dev *dev) struct nvm_geo *geo = &dev->geo; int ret; + memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); + + if (grp->mtype != 0) { + pr_err("nvm: memory type not supported\n"); + return -EINVAL; + } + /* Whole device values */ geo->nr_chnls = grp->num_ch; - geo->luns_per_chnl = grp->num_lun; - - /* Generic device values */ - geo->pgs_per_blk = grp->num_pg; - geo->blks_per_lun = grp->num_blk; - geo->nr_planes = grp->num_pln; - geo->fpg_size = grp->fpg_sz; - geo->pfpg_size = grp->fpg_sz * grp->num_pln; + geo->nr_luns = grp->num_lun; + + /* Generic device geometry values */ + geo->ws_min = grp->ws_min; + geo->ws_opt = grp->ws_opt; + geo->ws_seq = grp->ws_seq; + geo->ws_per_chk = grp->ws_per_chk; + geo->nr_chks = grp->num_chk; geo->sec_size = grp->csecs; geo->oob_size = grp->sos; - geo->sec_per_pg = grp->fpg_sz / grp->csecs; geo->mccap = grp->mccap; - memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); - - geo->plane_mode = NVM_PLANE_SINGLE; geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; - if (grp->mpos & 0x020202) - geo->plane_mode = NVM_PLANE_DOUBLE; - if (grp->mpos & 0x040404) - geo->plane_mode = NVM_PLANE_QUAD; - - if (grp->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } + geo->sec_per_chk = grp->clba; + geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks; + geo->all_luns = geo->nr_luns * geo->nr_chnls; - /* calculated values */ + /* 1.2 spec device geometry values */ + geo->plane_mode = 1 << geo->ws_seq; + geo->nr_planes = geo->ws_opt / geo->ws_min; + geo->sec_per_pg = geo->ws_min; geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; - geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk; - geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun; - geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls; - dev->total_secs = geo->nr_luns * geo->sec_per_lun; - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), + dev->total_secs = geo->all_luns * geo->sec_per_lun; + dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), sizeof(unsigned long), GFP_KERNEL); if (!dev->lun_map) return -ENOMEM; @@ -854,8 +851,8 @@ static int nvm_init(struct nvm_dev *dev) pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", dev->name, geo->sec_per_pg, geo->nr_planes, - geo->pgs_per_blk, geo->blks_per_lun, - geo->nr_luns, geo->nr_chnls); + geo->ws_per_chk, geo->nr_chks, + geo->all_luns, geo->nr_chnls); return 0; err: pr_err("nvm: failed to initialize nvm\n"); @@ -946,12 +943,12 @@ static int 
__nvm_configure_create(struct nvm_ioctl_create *create) if (s->lun_begin == -1 && s->lun_end == -1) { s->lun_begin = 0; - s->lun_end = dev->geo.nr_luns - 1; + s->lun_end = dev->geo.all_luns - 1; } - if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { + if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.all_luns) { pr_err("nvm: lun out of bound (%u:%u > %u)\n", - s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); + s->lun_begin, s->lun_end, dev->geo.all_luns - 1); return -EINVAL; } diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 76516ee84e9a..0849046b2a7a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, /* Start metadata */ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); + smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); /* Fill metadata among lines */ if (cur) { @@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, lm->sec_per_line); bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, lm->sec_per_line); - line->sec_in_line -= geo->sec_per_blk; + line->sec_in_line -= geo->sec_per_chk; if (bit >= lm->emeta_bb) nr_bb++; } @@ -1746,7 +1746,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; - int nr_luns = geo->nr_luns; + int nr_luns = geo->all_luns; int bit = -1; while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 695826a06b5d..d13bb51f0e2f 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk) } ppaf.ch_len = power_len; - power_len = get_count_order(geo->luns_per_chnl); - if (1 << power_len != geo->luns_per_chnl) { + power_len = get_count_order(geo->nr_luns); + if (1 << power_len != geo->nr_luns) { pr_err("pblk: supports only power-of-two LUN config.\n"); return -EINVAL; } @@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk) struct nvm_geo *geo = &dev->geo; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * - geo->nr_planes * geo->nr_luns; + geo->nr_planes * geo->all_luns; if (pblk_init_global_caches(pblk)) return -ENOMEM; @@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->gen_ws_pool) goto free_page_bio_pool; - pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); + pblk->rec_pool = mempool_create_slab_pool(geo->all_luns, + pblk_rec_cache); if (!pblk->rec_pool) goto free_gen_ws_pool; - pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_g_rq_cache); if (!pblk->r_rq_pool) goto free_rec_pool; - pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_g_rq_cache); if (!pblk->e_rq_pool) goto free_r_rq_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_w_rq_cache); if (!pblk->w_rq_pool) goto free_e_rq_pool; @@ -409,7 +410,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun) u8 *blks; int nr_blks, ret; - nr_blks = geo->blks_per_lun * geo->plane_mode; + nr_blks = geo->nr_chks * geo->plane_mode; blks = 
kmalloc(nr_blks, GFP_KERNEL); if (!blks) return -ENOMEM; @@ -482,20 +483,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) int i, ret; /* TODO: Implement unbalanced LUN support */ - if (geo->luns_per_chnl < 0) { + if (geo->nr_luns < 0) { pr_err("pblk: unbalanced LUN config.\n"); return -EINVAL; } - pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); + pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), + GFP_KERNEL); if (!pblk->luns) return -ENOMEM; - for (i = 0; i < geo->nr_luns; i++) { + for (i = 0; i < geo->all_luns; i++) { /* Stripe across channels */ int ch = i % geo->nr_chnls; int lun_raw = i / geo->nr_chnls; - int lunid = lun_raw + ch * geo->luns_per_chnl; + int lunid = lun_raw + ch * geo->nr_luns; rlun = &pblk->luns[i]; rlun->bppa = luns[lunid]; @@ -590,8 +592,8 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) * on user capacity consider only provisioned blocks */ pblk->rl.total_blocks = nr_free_blks; - pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; - pblk->capacity = provisioned * geo->sec_per_blk; + pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk; + pblk->capacity = provisioned * geo->sec_per_chk; atomic_set(&pblk->rl.free_blocks, nr_free_blks); } @@ -683,7 +685,7 @@ static int pblk_lines_init(struct pblk *pblk) int i, ret; pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); - max_write_ppas = pblk->min_write_pgs * geo->nr_luns; + max_write_ppas = pblk->min_write_pgs * geo->all_luns; pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? max_write_ppas : nvm_max_phys_sects(dev); pblk_set_sec_per_write(pblk, pblk->min_write_pgs); @@ -693,26 +695,26 @@ static int pblk_lines_init(struct pblk *pblk) return -EINVAL; } - div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); + div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod); if (mod) { pr_err("pblk: bad configuration of sectors/pages\n"); return -EINVAL; } - l_mg->nr_lines = geo->blks_per_lun; + l_mg->nr_lines = geo->nr_chks; l_mg->log_line = l_mg->data_line = NULL; l_mg->l_seq_nr = l_mg->d_seq_nr = 0; l_mg->nr_free_lines = 0; bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; - lm->blk_per_line = geo->nr_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->sec_per_line = geo->sec_per_chk * geo->all_luns; + lm->blk_per_line = geo->all_luns; + lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); lm->mid_thrs = lm->sec_per_line / 2; lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; + lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct * line_smeta definition @@ -742,12 +744,12 @@ add_emeta_page: goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; + lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; lm->min_blk_line = 1; - if (geo->nr_luns > 1) + if (geo->all_luns > 1) lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->sec_per_blk); + lm->emeta_sec[0], geo->sec_per_chk); if (lm->min_blk_line > lm->blk_per_line) { pr_err("pblk: config. not supported. Min. 
LUN in line:%d\n", @@ -772,7 +774,7 @@ add_emeta_page: goto fail_free_bb_template; } - bb_distance = (geo->nr_luns) * geo->sec_per_pl; + bb_distance = (geo->all_luns) * geo->sec_per_pl; for (i = 0; i < lm->sec_per_line; i += bb_distance) bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); @@ -844,7 +846,7 @@ add_emeta_page: pblk_set_provision(pblk, nr_free_blks); /* Cleanup per-LUN bad block lists - managed within lines on run-time */ - for (i = 0; i < geo->nr_luns; i++) + for (i = 0; i < geo->all_luns; i++) kfree(pblk->luns[i].bb_list); return 0; @@ -858,7 +860,7 @@ fail_free_bb_template: fail_free_meta: pblk_line_meta_free(pblk); fail: - for (i = 0; i < geo->nr_luns; i++) + for (i = 0; i < geo->all_luns; i++) kfree(pblk->luns[i].bb_list); return ret; @@ -1041,13 +1043,13 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, blk_queue_write_cache(tqueue, true, false); - tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; + tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size; tqueue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->nr_luns, pblk->l_mg.nr_lines, + geo->all_luns, pblk->l_mg.nr_lines, (unsigned long long)pblk->rl.nr_secs, pblk->rwb.nr_entries); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index eadb3eb5d4dc..ceec12d26643 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - - nr_bb * geo->sec_per_blk; + nr_bb * geo->sec_per_chk; } struct pblk_recov_alloc { diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index cd49e8875d4e..5cee2ac49c72 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) ssize_t sz = 0; int i; - for (i = 0; i < geo->nr_luns; i++) { + for (i = 0; i < geo->all_luns; i++) { int active = 1; rlun = &pblk->luns[i]; @@ -238,7 +238,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) sz = snprintf(page, PAGE_SIZE - sz, "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->nr_luns, lm->blk_per_line, lm->sec_per_line); + geo->all_luns, lm->blk_per_line, lm->sec_per_line); sz += snprintf(page + sz, PAGE_SIZE - sz, "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", @@ -287,7 +287,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) "blk_line:%d, sec_line:%d, sec_blk:%d\n", lm->blk_per_line, lm->sec_per_line, - geo->sec_per_blk); + geo->sec_per_chk); return sz; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 59a64d461a5d..c150728c3b49 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -907,7 +907,7 @@ static inline int pblk_pad_distance(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl; + return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl; } static inline int pblk_dev_ppa_to_line(struct ppa_addr p) @@ -1212,10 +1212,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, if (!ppa->c.is_cached && ppa->g.ch < geo->nr_chnls && - 
ppa->g.lun < geo->luns_per_chnl && + ppa->g.lun < geo->nr_luns && ppa->g.pl < geo->nr_planes && - ppa->g.blk < geo->blks_per_lun && - ppa->g.pg < geo->pgs_per_blk && + ppa->g.blk < geo->nr_chks && + ppa->g.pg < geo->ws_per_chk && ppa->g.sec < geo->sec_per_pg) continue; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 15bf243f6096..50ef71ee3d86 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -135,7 +135,7 @@ struct nvme_nvm_id_group { __u8 num_lun; __u8 num_pln; __u8 rsvd1; - __le16 num_blk; + __le16 num_chk; __le16 num_pg; __le16 fpg_sz; __le16 csecs; @@ -215,36 +215,57 @@ static inline void _nvme_nvm_check_size(void) static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; - struct nvm_id_group *dst; + struct nvm_id_group *grp; + int sec_per_pg, sec_per_pl, pg_per_blk; if (nvme_nvm_id->cgrps != 1) return -EINVAL; src = &nvme_nvm_id->groups[0]; - dst = &nvm_id->grp; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); + grp = &nvm_id->grp; + + grp->mtype = src->mtype; + grp->fmtype = src->fmtype; + + grp->num_ch = src->num_ch; + grp->num_lun = src->num_lun; + + grp->num_chk = le16_to_cpu(src->num_chk); + grp->csecs = le16_to_cpu(src->csecs); + grp->sos = le16_to_cpu(src->sos); + + pg_per_blk = le16_to_cpu(src->num_pg); + sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs; + sec_per_pl = sec_per_pg * src->num_pln; + grp->clba = sec_per_pl * pg_per_blk; + grp->ws_per_chk = pg_per_blk; + + grp->mpos = le32_to_cpu(src->mpos); + grp->cpar = le16_to_cpu(src->cpar); + grp->mccap = le32_to_cpu(src->mccap); + + grp->ws_opt = grp->ws_min = sec_per_pg; + grp->ws_seq = NVM_IO_SNGL_ACCESS; + + if (grp->mpos & 0x020202) { + grp->ws_seq = NVM_IO_DUAL_ACCESS; + grp->ws_opt <<= 1; + } else if (grp->mpos & 0x040404) { + grp->ws_seq = NVM_IO_QUAD_ACCESS; + grp->ws_opt <<= 2; + } + + grp->trdt = le32_to_cpu(src->trdt); + grp->trdm = le32_to_cpu(src->trdm); + grp->tprt = le32_to_cpu(src->tprt); + grp->tprm = le32_to_cpu(src->tprm); + grp->tbet = le32_to_cpu(src->tbet); + grp->tbem = le32_to_cpu(src->tbem); + + /* 1.2 compatibility */ + grp->num_pln = src->num_pln; + grp->num_pg = le16_to_cpu(src->num_pg); + grp->fpg_sz = le16_to_cpu(src->fpg_sz); return 0; } @@ -293,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->blks_per_lun * geo->plane_mode; + int nr_blks = geo->nr_chks * geo->plane_mode; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int ret = 0; @@ -334,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); + memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode); out: 
kfree(bb_tbl); return ret; @@ -773,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, } else if (strcmp(attr->name, "num_planes") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk); } else if (strcmp(attr->name, "num_pages") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); } else if (strcmp(attr->name, "page_size") == 0) { diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index a5d8e0cbbb46..8e43bfebd38d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -159,12 +159,16 @@ struct nvm_id_group { u8 fmtype; u8 num_ch; u8 num_lun; - u8 num_pln; - u16 num_blk; - u16 num_pg; - u16 fpg_sz; + u16 num_chk; + u16 clba; u16 csecs; u16 sos; + + u16 ws_min; + u16 ws_opt; + u16 ws_seq; + u16 ws_per_chk; + u32 trdt; u32 trdm; u32 tprt; @@ -174,6 +178,11 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + /* 1.2 compatibility */ + u8 num_pln; + u16 num_pg; + u16 fpg_sz; }; struct nvm_addr_format { @@ -259,31 +268,36 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; + /* Device generic information */ struct nvm_geo { + /* generic geometry */ int nr_chnls; - int nr_luns; - int luns_per_chnl; /* -1 if channels are not symmetric */ - int nr_planes; - int sec_per_pg; /* only sectors for a single page */ - int pgs_per_blk; - int blks_per_lun; - int fpg_size; - int pfpg_size; /* size of buffer if all pages are to be read */ + int all_luns; /* across channels */ + int nr_luns; /* per channel */ + int nr_chks; /* per lun */ + int sec_size; int oob_size; int mccap; - struct nvm_addr_format ppaf; - /* Calculated/Cached values. These do not reflect the actual usable - * blocks at run-time. - */ + int sec_per_chk; + int sec_per_lun; + + int ws_min; + int ws_opt; + int ws_seq; + int ws_per_chk; + int max_rq_size; - int plane_mode; /* drive device in single, double or quad mode */ + struct nvm_addr_format ppaf; + + /* Legacy 1.2 specific geometry */ + int plane_mode; /* drive device in single, double or quad mode */ + int nr_planes; + int sec_per_pg; /* only sectors for a single page */ int sec_per_pl; /* all sectors across planes */ - int sec_per_blk; - int sec_per_lun; }; /* sub-device structure */ -- cgit v1.2.3 From e53927393b9987b7c986b6364c27111077f0ea3e Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 5 Jan 2018 14:16:14 +0100 Subject: lightnvm: set target over-provision on create ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow setting the over-provision percentage on target creation. If the value is not provided, fall back to the default value set by the target.
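For illustration, a minimal userspace sketch of the extended create interface added by this patch (device and target names are hypothetical; the control node and the NVM_DEV_CREATE ioctl are the ones already used for simple creates):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/lightnvm.h>

	/* Sketch: create a pblk target with an explicit 7% over-provision. */
	static int create_pblk_with_op(void)
	{
		struct nvm_ioctl_create c;
		int fd, ret;

		fd = open("/dev/lightnvm/control", O_RDWR);
		if (fd < 0)
			return -1;

		memset(&c, 0, sizeof(c));
		strcpy(c.dev, "nvme0n1");	/* underlying OCSSD (hypothetical) */
		strcpy(c.tgttype, "pblk");
		strcpy(c.tgtname, "mydevice");	/* exposed block device name */

		c.conf.type = NVM_CONFIG_TYPE_EXTENDED;
		c.conf.e.lun_begin = 0xFFFF;	/* 0xFFFF:0xFFFF selects all LUNs */
		c.conf.e.lun_end = 0xFFFF;
		c.conf.e.op = 7;		/* percent; 0xFFFF requests the target default */
		c.conf.e.rsv = 0;		/* reserved, must be zero */

		ret = ioctl(fd, NVM_DEV_CREATE, &c);
		close(fd);
		return ret;
	}

Values of op outside the NVM_TARGET_MIN_OP..NVM_TARGET_MAX_OP range (3..80) are rejected with -EINVAL.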
In pblk, set the default OP to 11% of the total size of the device. Signed-off-by: Javier González Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 106 +++++++++++++++++++++++++++++++++--------- drivers/lightnvm/pblk-init.c | 5 +- drivers/lightnvm/pblk.h | 2 + include/linux/lightnvm.h | 6 +++ include/uapi/linux/lightnvm.h | 9 ++++ 5 files changed, 104 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index d5f231c9339e..dcc9e621e651 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -140,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) } static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - int lun_begin, int lun_end) + u16 lun_begin, u16 lun_end, + u16 op) { struct nvm_tgt_dev *tgt_dev = NULL; struct nvm_dev_map *dev_rmap = dev->rmap; @@ -219,6 +220,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, tgt_dev->geo.nr_chnls = nr_chnls; tgt_dev->geo.all_luns = nr_luns; tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1; + tgt_dev->geo.op = op; tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; tgt_dev->q = dev->q; tgt_dev->map = dev_map; @@ -266,9 +268,57 @@ static struct nvm_tgt_type *nvm_find_target_type(const char *name) return tt; } +static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, + int lun_end) +{ + if (lun_begin > lun_end || lun_end >= geo->all_luns) { + pr_err("nvm: lun out of bound (%u:%u > %u)\n", + lun_begin, lun_end, geo->all_luns - 1); + return -EINVAL; + } + + return 0; +} + +static int __nvm_config_simple(struct nvm_dev *dev, + struct nvm_ioctl_create_simple *s) +{ + struct nvm_geo *geo = &dev->geo; + + if (s->lun_begin == -1 && s->lun_end == -1) { + s->lun_begin = 0; + s->lun_end = geo->all_luns - 1; + } + + return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); +} + +static int __nvm_config_extended(struct nvm_dev *dev, + struct nvm_ioctl_create_extended *e) +{ + struct nvm_geo *geo = &dev->geo; + + if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { + e->lun_begin = 0; + e->lun_end = dev->geo.all_luns - 1; + } + + /* op not set falls into target's default */ + if (e->op == 0xFFFF) + e->op = NVM_TARGET_DEFAULT_OP; + + if (e->op < NVM_TARGET_MIN_OP || + e->op > NVM_TARGET_MAX_OP) { + pr_err("nvm: invalid over provisioning value\n"); + return -EINVAL; + } + + return nvm_config_check_luns(geo, e->lun_begin, e->lun_end); +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { - struct nvm_ioctl_create_simple *s = &create->conf.s; + struct nvm_ioctl_create_extended e; struct request_queue *tqueue; struct gendisk *tdisk; struct nvm_tgt_type *tt; @@ -277,6 +327,28 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) void *targetdata; int ret; + switch (create->conf.type) { + case NVM_CONFIG_TYPE_SIMPLE: + ret = __nvm_config_simple(dev, &create->conf.s); + if (ret) + return ret; + + e.lun_begin = create->conf.s.lun_begin; + e.lun_end = create->conf.s.lun_end; + e.op = NVM_TARGET_DEFAULT_OP; + break; + case NVM_CONFIG_TYPE_EXTENDED: + ret = __nvm_config_extended(dev, &create->conf.e); + if (ret) + return ret; + + e = create->conf.e; + break; + default: + pr_err("nvm: config type not valid\n"); + return -EINVAL; + } + tt = nvm_find_target_type(create->tgttype); if (!tt) { pr_err("nvm: target type %s not found\n", create->tgttype); @@ -289,7 +361,7 @@ static int
nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) return -EINVAL; } - ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); + ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); if (ret) return ret; @@ -299,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_reserve; } - tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); + tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); if (!tgt_dev) { pr_err("nvm: could not create target device\n"); ret = -ENOMEM; @@ -369,7 +441,7 @@ err_dev: err_t: kfree(t); err_reserve: - nvm_release_luns_err(dev, s->lun_begin, s->lun_end); + nvm_release_luns_err(dev, e.lun_begin, e.lun_end); return ret; } @@ -949,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister); static int __nvm_configure_create(struct nvm_ioctl_create *create) { struct nvm_dev *dev; - struct nvm_ioctl_create_simple *s; down_write(&nvm_lock); dev = nvm_find_nvm_dev(create->dev); @@ -960,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { - pr_err("nvm: config type not valid\n"); - return -EINVAL; - } - s = &create->conf.s; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = dev->geo.all_luns - 1; - } - - if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.all_luns) { - pr_err("nvm: lun out of bound (%u:%u > %u)\n", - s->lun_begin, s->lun_end, dev->geo.all_luns - 1); - return -EINVAL; - } - return nvm_create_tgt(dev, create); } @@ -1076,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) return -EFAULT; + if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && + create.conf.e.rsv != 0) { + pr_err("nvm: reserved config field in use\n"); + return -EINVAL; + } + create.dev[DISK_NAME_LEN - 1] = '\0'; create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; create.tgtname[DISK_NAME_LEN - 1] = '\0'; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index c8a718249e26..533f6908e238 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -585,7 +585,10 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) sector_t provisioned; int sec_meta, blk_meta; - pblk->op = 20; + if (geo->op == NVM_TARGET_DEFAULT_OP) + pblk->op = PBLK_DEFAULT_OP; + else + pblk->op = geo->op; provisioned = nr_free_blks; provisioned *= (100 - pblk->op); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 1e719d4181ce..19e622c65e92 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -54,6 +54,8 @@ /* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) +#define PBLK_DEFAULT_OP (11) + enum { PBLK_READ = READ, PBLK_WRITE = WRITE,/* Write from write buffer */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8e43bfebd38d..7f4b60abdf27 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -218,6 +218,10 @@ struct nvm_target { #define ADDR_EMPTY (~0ULL) +#define NVM_TARGET_DEFAULT_OP (101) +#define NVM_TARGET_MIN_OP (3) +#define NVM_TARGET_MAX_OP (80) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 @@ -291,6 +295,8 @@ struct nvm_geo { int max_rq_size; + int op; + struct nvm_addr_format ppaf; /* Legacy 1.2 specific geometry */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 42d1a434af29..f9a1be7fc696 100644 --- 
a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple { __u32 lun_end; }; +struct nvm_ioctl_create_extended { + __u16 lun_begin; + __u16 lun_end; + __u16 op; + __u16 rsv; +}; + enum { NVM_CONFIG_TYPE_SIMPLE = 0, + NVM_CONFIG_TYPE_EXTENDED = 1, }; struct nvm_ioctl_create_conf { __u32 type; union { struct nvm_ioctl_create_simple s; + struct nvm_ioctl_create_extended e; }; }; -- cgit v1.2.3 From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Dec 2017 15:43:38 +0900 Subject: block: introduce zoned block devices zone write locking Components relying only on the request_queue structure for accessing block devices (e.g. I/O schedulers) have a limited knowledge of the device characteristics. In particular, the device capacity cannot be easily discovered, which for a zoned block device also results in the inability to easily know the number of zones of the device (the zone size is indicated by the chunk_sectors field of the queue limits). Introduce the nr_zones field to the request_queue structure to simplify access to this information. Also, add the bitmap seq_zones_bitmap which indicates which zones of the device are sequential zones (write preferred or write required) and the bitmap seq_zones_wlock which indicates if a zone is write locked, that is, if a write request targeting a zone was dispatched to the device. These fields are initialized by the low level block device driver (sd.c for ZBC/ZAC disks). They are not initialized by stacking drivers (device mappers) handling zoned block devices (e.g. dm-linear). Using this, I/O schedulers can introduce zone write locking to control request dispatching to a zoned block device and avoid write request reordering by limiting to at most a single write request per zone outside of the scheduler at any time. Based on previous patches from Damien Le Moal. Signed-off-by: Christoph Hellwig [Damien] * Fixed comments and indentation in blkdev.h * Changed helper functions * Fixed this commit message Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-zoned.c | 42 +++++++++++++++++++ include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index b8881750a3ac..e6e5bbc4c366 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1641,6 +1641,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) lockdep_assert_held(q->queue_lock); + blk_req_zone_write_unlock(req); blk_pm_put_request(req); elv_completed_request(q, req); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb51b338..acb7252c7e81 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue *q, return sector & ~zone_mask; } +/* + * Return true if a request is a write requests that needs zone write locking.
+ */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ + if (!rq->q->seq_zones_wlock) + return false; + + if (blk_rq_is_passthrough(rq)) + return false; + + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +void __blk_req_zone_write_lock(struct request *rq) +{ + if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock))) + return; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ + rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; + if (rq->q->seq_zones_wlock) + WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + /* * Check that a zone report belongs to the partition. * If yes, fix its start sector and write pointer, copy it in the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..46e606f5b44b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -546,6 +548,22 @@ struct request_queue { struct queue_limits limits; + /* + * Zoned block device information for request dispatch control. + * nr_zones is the total number of zones of the device. This is always + * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones + * bits which indicates if a zone is conventional (bit clear) or + * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones + * bits which indicates if a zone is write locked, that is, if a write + * request targeting the zone was dispatched. All three fields are + * initialized by the low level device driver (e.g. scsi/sd.c). + * Stacking drivers (device mappers) may or may not initialize + * these fields. + */ + unsigned int nr_zones; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + /* * sg stuff */ @@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? 
q->limits.chunk_sectors : 0; } +static inline unsigned int blk_queue_nr_zones(struct request_queue *q) +{ + return q->nr_zones; +} + +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q)) + return 0; + return sector >> ilog2(q->limits.chunk_sectors); +} + +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) + return false; + return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); +} + static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); @@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such @@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) if (q) return blk_queue_zone_sectors(q); + return 0; +} +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_nr_zones(q); return 0; } @@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +#ifdef CONFIG_BLK_DEV_ZONED +bool blk_req_needs_zone_write_lock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From 86292abc5af206f64192a0b60da06fd604debdc0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:03 +0800 Subject: block: introduce bio helpers for converting to multipage bvec The following helpers are introduced for converting current users of direct access to the bvec table, and to prepare for supporting multipage bvecs: bio_pages_all() bio_first_bvec_all() bio_first_page_all()
bio_last_bvec_all() All are named bio_*_all() to follow bio_for_each_segment_all(). They may only be used on a bio with !bio_flagged(bio, BIO_CLONED), that is, one whose whole bvec table is covered. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 82f0c8fd7be8..435ddf04e889 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) bv->bv_len = iter.bi_bvec_done; } +static inline unsigned bio_pages_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_vcnt; +} + +static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_io_vec; +} + +static inline struct page *bio_first_page_all(struct bio *bio) +{ + return bio_first_bvec_all(bio)->bv_page; +} + +static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return &bio->bi_io_vec[bio->bi_vcnt - 1]; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ -- cgit v1.2.3 From 3c892a098b0bfa3e571f1f0d2a7e72fbaeea691a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:07 +0800 Subject: block: bounce: don't access bio->bi_io_vec in copy_to_high_bio_irq First, this patch introduces BVEC_ITER_ALL_INIT for iterating one bio from start to end. Then, since we need to support multipage bvecs, don't access bio->bi_io_vec directly in copy_to_high_bio_irq(); just use the standard iterator for that. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 16 +++++++++++----- include/linux/bvec.h | 9 +++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/block/bounce.c b/block/bounce.c index 0274c31d6c05..c35a3d7f0528 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -113,24 +113,30 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bio_vec tovec, fromvec; struct bvec_iter iter; + /* + * The bio of @from is created by bounce, so we can iterate + * its bvec from start to end, but the @from->bi_iter can't be + * trusted because it might be changed by splitting.
+ */ + struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { + fromvec = bio_iter_iovec(from, from_iter); + if (tovec.bv_page != fromvec.bv_page) { /* * fromvec->bv_offset and fromvec->bv_len might have * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec->bv_page) + + vfrom = page_address(fromvec.bv_page) + tovec.bv_offset; bounce_copy_vec(&tovec, vfrom); flush_dcache_page(tovec.bv_page); } - - fromvec++; + bio_advance_iter(from, &from_iter, tovec.bv_len); } } diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ec8a4d7af6bd..fe7a22dd133b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) +/* for iterating one bio from start to end */ +#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ +{ \ + .bi_sector = 0, \ + .bi_size = UINT_MAX, \ + .bi_idx = 0, \ + .bi_bvec_done = 0, \ +} + #endif /* __LINUX_BVEC_ITER_H */ -- cgit v1.2.3 From 25d8be77e19224d8f21b363d77b5283c5dc21a57 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:10 +0800 Subject: block: move bio_alloc_pages() to bcache bcache is the only user of bio_alloc_pages(), so move this function into bcache, and avoid it being misused in the future. Also rename it to bch_bio_alloc_pages() since it is bcache only. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 28 ---------------------------- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/util.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/util.h | 1 + drivers/md/bcache/writeback.c | 2 +- include/linux/bio.h | 1 - 9 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 8bfdea58159b..fe1efbeaf4aa 100644 --- a/block/bio.c +++ b/block/bio.c @@ -968,34 +968,6 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed.
- */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - /** * bio_copy_data - copy contents of data buffers from one chain of bios to * another diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 02a4cf646fdc..ebb1874218e7 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4900da..879ab21074c6 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) return; check->bi_opf = REQ_OP_READ; - if (bio_alloc_pages(check, GFP_NOIO)) + if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c97da68..a24c3a95b2c0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) bio_set_op_attrs(bio, REQ_OP_READ, 0); bio->bi_end_io = read_moving_endio; - if (bio_alloc_pages(bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021624f..c493fb947dc9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -841,7 +841,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); - if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; if (reada) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 61813d230015..a23cd6a14b74 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -283,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a412eb8..4df4c5c1cab2 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a37884ca8b..1ac2af6128b1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -278,7 +278,7 @@ static void read_dirty(struct cached_dev *dc) bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; trace_bcache_writeback(&w->key); diff --git a/include/linux/bio.h b/include/linux/bio.h index 435ddf04e889..367a979fd4a6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -500,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, -- cgit v1.2.3 From e80a0af4759a164214f02da157a3800753ce135f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 5 Jan 2018 08:26:46 -0800 Subject: lib/scatterlist: Introduce sgl_alloc() and sgl_free() Many kernel drivers contain code that allocates and frees both a scatterlist and the pages that populate that scatterlist. Introduce functions in lib/scatterlist.c that perform these tasks instead of duplicating this functionality in multiple drivers. Only include these functions in the build if CONFIG_SGL_ALLOC=y, so that the kernel size does not increase when this functionality is unused.
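As a rough usage sketch (illustrative only, not part of this patch; the transfer size and the demo_* names are made up), a driver that selects CONFIG_SGL_ALLOC could pair the helpers like so:

#include <linux/scatterlist.h>
#include <linux/sizes.h>

/* Hypothetical driver buffer: one scatterlist backed by freshly
 * allocated pages, sized for a 1 MiB transfer. */
static struct scatterlist *demo_setup_buffer(unsigned int *nents)
{
        /* sgl_alloc() allocates both the scatterlist and its pages;
         * *nents receives the number of entries. */
        return sgl_alloc(SZ_1M, GFP_KERNEL, nents);
}

static void demo_teardown_buffer(struct scatterlist *sgl)
{
        /* sgl_free() releases the pages and the scatterlist itself. */
        sgl_free(sgl);
}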
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 10 +++++ lib/Kconfig | 4 ++ lib/scatterlist.c | 105 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b7c83254c566..b8a7c1d1dbe3 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -276,6 +276,16 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask); +#ifdef CONFIG_SGL_ALLOC +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p); +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p); +void sgl_free_order(struct scatterlist *sgl, int order); +void sgl_free(struct scatterlist *sgl); +#endif /* CONFIG_SGL_ALLOC */ + size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); diff --git a/lib/Kconfig b/lib/Kconfig index c5e84fbcb30b..4dd5c11366f9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -409,6 +409,10 @@ config HAS_DMA depends on !NO_DMA default y +config SGL_ALLOC + bool + default n + config DMA_NOOP_OPS bool depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7c1c55f7daaa..9afc9b432083 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -474,6 +474,111 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, } EXPORT_SYMBOL(sg_alloc_table_from_pages); +#ifdef CONFIG_SGL_ALLOC + +/** + * sgl_alloc_order - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist. Must be at least one + * @order: Second argument for alloc_pages() + * @chainable: Whether or not to allocate an extra element in the scatterlist + * for scatterlist chaining purposes + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist that have pages + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. 
+ */ +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p) +{ + struct scatterlist *sgl, *sg; + struct page *page; + unsigned int nent, nalloc; + u32 elem_len; + + nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); + /* Check for integer overflow */ + if (length > (nent << (PAGE_SHIFT + order))) + return NULL; + nalloc = nent; + if (chainable) { + /* Check for integer overflow */ + if (nalloc + 1 < nalloc) + return NULL; + nalloc++; + } + sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), + (gfp & ~GFP_DMA) | __GFP_ZERO); + if (!sgl) + return NULL; + + sg_init_table(sgl, nent); + sg = sgl; + while (length) { + elem_len = min_t(u64, length, PAGE_SIZE << order); + page = alloc_pages(gfp, order); + if (!page) { + sgl_free(sgl); + return NULL; + } + + sg_set_page(sg, page, elem_len, 0); + length -= elem_len; + sg = sg_next(sg); + } + WARN_ON_ONCE(sg); + if (nent_p) + *nent_p = nent; + return sgl; +} +EXPORT_SYMBOL(sgl_alloc_order); + +/** + * sgl_alloc - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. + */ +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p) +{ + return sgl_alloc_order(length, 0, false, gfp, nent_p); +} +EXPORT_SYMBOL(sgl_alloc); + +/** + * sgl_free_order - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + * @order: Second argument for __free_pages() + */ +void sgl_free_order(struct scatterlist *sgl, int order) +{ + struct scatterlist *sg; + struct page *page; + + for (sg = sgl; sg; sg = sg_next(sg)) { + page = sg_page(sg); + if (page) + __free_pages(page, order); + } + kfree(sgl); +} +EXPORT_SYMBOL(sgl_free_order); + +/** + * sgl_free - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + */ +void sgl_free(struct scatterlist *sgl) +{ + sgl_free_order(sgl, 0); +} +EXPORT_SYMBOL(sgl_free); + +#endif /* CONFIG_SGL_ALLOC */ + void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) -- cgit v1.2.3 From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:48 -0800 Subject: blk-mq: replace timeout synchronization with a RCU and generation based scheme Currently, blk-mq timeout path synchronizes against the usual issue/completion path using a complex scheme involving atomic bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence rules. Unfortunately, it contains quite a few holes. There's a complex dancing around REQ_ATOM_STARTED and REQ_ATOM_COMPLETE between issue/completion and timeout paths; however, they don't have a synchronization point across request recycle instances and it isn't clear what the barriers add. blk_mq_check_expired() can easily read STARTED from N-2'th iteration, deadline from N-1'th, blk_mark_rq_complete() against Nth instance. In fact, it's pretty easy to make blk_mq_check_expired() terminate a later instance of a request. If we induce 5 sec delay before time_after_eq() test in blk_mq_check_expired(), shorten the timeout to 2s, and issue back-to-back large IOs, blk-mq starts timing out requests spuriously pretty quickly. Nothing actually timed out. 
It just made the call on a recycle instance of a request and then terminated a later instance long after the original instance finished. The scenario isn't theoretical either. This patch replaces the broken synchronization mechanism with an RCU and generation number based one. 1. Each request has a u64 generation + state value, which can be updated only by the request owner. Whenever a request becomes in-flight, the generation number gets bumped up too. This provides the basis for the timeout path to distinguish different recycle instances of the request. Also, marking a request in-flight and setting its deadline are protected with a seqcount so that the timeout path can fetch both values coherently. 2. The timeout path fetches the generation, state and deadline. If the verdict is timeout, it records the generation into a dedicated request abortion field and does an RCU wait. 3. The completion path is also protected by RCU (from the previous patch) and checks whether the current generation number and state match the abortion field. If so, it skips completion. 4. The timeout path, after the RCU wait, scans requests again and terminates the ones whose generation and state still match the ones requested for abortion. By now, the timeout path knows that it has either lost the race (the generation number and state have changed) or that any completion will yield to it, so it can safely time out the request. While it's more lines of code, it's conceptually simpler, doesn't depend on direct use of subtle memory ordering or coherence, and hopefully doesn't terminate the wrong instance. While this change makes REQ_ATOM_COMPLETE synchronization unnecessary between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't removed yet as it's still used in other places. Future patches will move all state tracking to the new mechanism and remove all bitops in the hot paths. Note that this patch adds a comment explaining a race condition in the BLK_EH_RESET_TIMER path. The race has always been there and this patch doesn't change it; it's just documenting the existing race. v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao. - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter. - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter. v3: - Fixed possible extended seqcount / u64_stats_sync read looping spotted by Peter. - MQ_RQ_IDLE was incorrectly being set in complete_request instead of free_request. Fixed. v4: - Rebased on top of hctx_lock() refactoring patch. - Added comment explaining the use of hctx_lock() in completion path. v5: - Added comments requested by Bart. - Note the addition of BLK_EH_RESET_TIMER race condition in the commit message.
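To make the scheme concrete, a simplified model of the gstate encoding follows (a sketch only: the MQ_RQ_* names match the patch, but the demo_* helpers are invented and all seqcount, RCU and memory-ordering details are elided):

#include <linux/types.h>

#define MQ_RQ_STATE_BITS 2
#define MQ_RQ_STATE_MASK ((1ULL << MQ_RQ_STATE_BITS) - 1)
#define MQ_RQ_GEN_INC (1ULL << MQ_RQ_STATE_BITS)

enum { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1 };

struct demo_rq {
        u64 gstate;         /* generation in the high bits, state in the low two */
        u64 aborted_gstate; /* instance the timeout path has claimed */
};

static void demo_mark_in_flight(struct demo_rq *rq)
{
        /* Every IDLE -> IN_FLIGHT transition bumps the generation,
         * which is what lets the timeout path tell recycle instances
         * of the same request apart. */
        rq->gstate = ((rq->gstate & ~MQ_RQ_STATE_MASK) + MQ_RQ_GEN_INC) |
                     MQ_RQ_IN_FLIGHT;
}

static bool demo_timeout_owns(struct demo_rq *rq)
{
        /* After the RCU grace period: if the claimed instance still
         * matches, no completion recycled the request in the meantime
         * and the timeout path may terminate it. */
        return rq->gstate == rq->aborted_gstate;
}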
Signed-off-by: Tejun Heo Cc: "jianchao.wang" Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 + block/blk-mq.c | 229 +++++++++++++++++++++++++++++++++---------------- block/blk-mq.h | 46 ++++++++++ block/blk-timeout.c | 2 +- block/blk.h | 6 -- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 23 +++++ 7 files changed, 230 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 2e0d041e2daf..f843ae4f858d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); } EXPORT_SYMBOL(blk_rq_init); diff --git a/block/blk-mq.c b/block/blk-mq.c index f5e57c80a82b..156203876c8c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); } +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + /* + * blk_mq_rq_aborted_gstate() is used from the completion path and + * can thus be called from irq context. u64_stats_fetch in the + * middle of update on the same CPU leads to lockup. Disable irq + * while updating. + */ + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq) if (unlikely(blk_should_fake_timeout(q))) return; + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through + * hctx_lock(). See blk_mq_timeout_work() for details. + * + * Completion path never blocks and we can directly use RCU here + * instead of hctx_lock() which can be either RCU or SRCU. + * However, that would complicate paths which want to synchronize + * against us. Let stay in sync with the issue path so that + * hctx_lock() covers both issue and completion paths. 
+ */ hctx_lock(hctx, &srcu_idx); - if (!blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && + !blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); hctx_unlock(hctx, srcu_idx); } @@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; void blk_mq_rq_timed_out(struct request *req, bool reserved) @@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); blk_clear_rq_complete(req); break; @@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; + + might_sleep(); if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). 
- */ - smp_rmb(); - - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = rq->deadline; + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (READ_ONCE(rq->gstate) == rq->aborted_gstate && + !blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. 
+ */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->queue_rq_srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) @@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff5bf62..cf01f6f8c73d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,19 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + + MQ_RQ_STATE_BITS = 2, + MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, + MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, +}; + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline int blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state. 
The caller is responsible for ensuring that + * there are no other updaters. A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, + enum mq_rq_state state) +{ + u64 old_val = READ_ONCE(rq->gstate); + u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + + if (state == MQ_RQ_IN_FLIGHT) { + WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); + new_val += MQ_RQ_GEN_INC; + } + + /* avoid exposing interim values */ + WRITE_ONCE(rq->gstate, new_val); +} + static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9aeb30..6427be7ac363 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -208,7 +208,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - WRITE_ONCE(req->deadline, jiffies + req->timeout); + req->deadline = jiffies + req->timeout; /* * Only the non-mq case needs to add the request to a protected list. diff --git a/block/blk.h b/block/blk.h index 3f1446937aec..9cb2739edb6a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { - /* - * Keep these two bits first - not because we depend on the - * value of them, but we do depend on them being in the same - * byte of storage to ensure ordering on writes. Keeping them - * first will achieve that nicely. - */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c862e2..460798dbac1f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; + unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46e606f5b44b..ae563d01b29d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include struct module; struct scsi_ioctl_command; @@ -230,6 +232,27 @@ struct request { unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate (generation number and + * state) carry the MQ_RQ_* state value and the upper bits the + * generation number which is monotonically incremented and used to + * distinguish the reuse instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + unsigned long deadline; struct list_head timeout_list; -- cgit v1.2.3 From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:51 -0800 Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq After the recent updates to use generation number and state based synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except to avoid firing the same timeout multiple times. 
Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple times. This removes atomic bitops from hot paths too. v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out(). v3: Added RQF_MQ_TIMEOUT_EXPIRED flag. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++++++-------- block/blk-timeout.c | 1 + include/linux/blkdev.h | 2 ++ 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 90f6910a83f6..d1000c6cbec6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -634,8 +634,7 @@ void blk_mq_complete_request(struct request *rq) * hctx_lock() covers both issue and completion paths. */ hctx_lock(hctx, &srcu_idx); - if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && - !blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) __blk_mq_complete_request(rq); hctx_unlock(hctx, srcu_idx); } @@ -685,8 +684,6 @@ void blk_mq_start_request(struct request *rq) preempt_enable(); set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -837,6 +834,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) return; + req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; + if (ops->timeout) ret = ops->timeout(req, reserved); @@ -852,7 +851,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) */ blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); - blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: break; @@ -871,7 +869,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, might_sleep(); - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + if ((rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) || + !test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; /* read coherent snapshots of @rq->state_gen and @rq->deadline */ @@ -906,8 +905,8 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, * now guaranteed to see @rq->aborted_gstate and yield. If * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. */ - if (READ_ONCE(rq->gstate) == rq->aborted_gstate && - !blk_mark_rq_complete(rq)) + if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && + READ_ONCE(rq->gstate) == rq->aborted_gstate) blk_mq_rq_timed_out(rq, reserved); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4f04cd1e0b74..ebe99963386c 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -214,6 +214,7 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* * Only the non-mq case needs to add the request to a protected list. 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae563d01b29d..007a7cf1f262 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* timeout is expired */ +#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 05707b64aed8f5f1674b25334fb720d651459d5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:53 -0800 Subject: blk-mq: rename blk_mq_hw_ctx->queue_rq_srcu to ->srcu The RCU protection has been expanded to cover both queueing and completion paths making ->queue_rq_srcu a misnomer. Rename it to ->srcu as suggested by Bart. Signed-off-by: Tejun Heo Cc: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++------- include/linux/blk-mq.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 275812909d77..0269d44d512e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -219,7 +219,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); else rcu = true; } @@ -564,7 +564,7 @@ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) rcu_read_unlock(); else - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->srcu, srcu_idx); } static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) @@ -572,7 +572,7 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) rcu_read_lock(); else - *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); + *srcu_idx = srcu_read_lock(hctx->srcu); } static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) @@ -937,7 +937,7 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) has_rcu = true; else - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); hctx->nr_expired = 0; } @@ -2101,7 +2101,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -2174,7 +2174,7 @@ static int blk_mq_init_hctx(struct request_queue *q, goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->queue_rq_srcu); + init_srcu_struct(hctx->srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2463,7 +2463,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), __alignof__(struct blk_mq_hw_ctx)) != sizeof(struct blk_mq_hw_ctx)); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 460798dbac1f..8efcf49796a3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -66,7 +66,7 @@ struct blk_mq_hw_ctx { #endif /* Must be the last member - see also blk_mq_hw_ctx_size(). 
*/ - struct srcu_struct queue_rq_srcu[0]; + struct srcu_struct srcu[0]; }; struct blk_mq_tag_set { -- cgit v1.2.3 From 9111e5686c8cf3905191d4feb819acd874437500 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Jan 2018 12:04:16 -0700 Subject: block: Provide blk_status_t decoding for path errors This patch provides a common decoder for block status path related errors that may be retried so various entities wishing to consult this do not have to duplicate this decision. Acked-by: Mike Snitzer Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1e628e032da..2d973ac54b09 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_AGAIN ((__force blk_status_t)12) +/** + * blk_path_error - returns true if error may be path related + * @error: status the request was completed with + * + * Description: + * This classifies block error status into non-retryable errors and ones + * that may be successful if retried on a failover path. + * + * Return: + * %false - retrying failover path will not help + * %true - may succeed if retried + */ +static inline bool blk_path_error(blk_status_t error) +{ + switch (error) { + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_PROTECTION: + return false; + } + + /* Anything else could be a path failure, so should be retried */ + return true; +} + struct blk_issue_stat { u64 stat; }; -- cgit v1.2.3 From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:30:56 -0700 Subject: block: remove REQ_ATOM_POLL_SLEPT We don't need this to be an atomic flag, it can be a regular flag. We either end up on the same CPU for the polling, in which case the state is sane, or we did the sleep which would imply the needed barrier to ensure we see the right state. 
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq.c | 5 ++--- block/blk.h | 2 -- include/linux/blkdev.h | 2 ++ 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 25d41151073d..dd890d5e0fbd 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -290,13 +290,13 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_TIMEOUT_EXPIRED), + RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME #define RQAF_NAME(name) [REQ_ATOM_##name] = #name static const char *const rqaf_name[] = { RQAF_NAME(COMPLETE), - RQAF_NAME(POLL_SLEPT), }; #undef RQAF_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 266fc4f6b046..3239ca9e199f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,7 +483,6 @@ void blk_mq_free_request(struct request *rq) blk_put_rl(blk_rq_rl(rq)); blk_mq_rq_update_state(rq, MQ_RQ_IDLE); - clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -2976,7 +2975,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, unsigned int nsecs; ktime_t kt; - if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) return false; /* @@ -2996,7 +2995,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (!nsecs) return false; - set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + rq->rq_flags |= RQF_MQ_POLL_SLEPT; /* * This will be replaced with the stats tracking code, using diff --git a/block/blk.h b/block/blk.h index a68dbe312ea3..eb306c52121e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -124,8 +124,6 @@ void blk_account_io_done(struct request *req); */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, - - REQ_ATOM_POLL_SLEPT, }; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 007a7cf1f262..ba31674d8581 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* timeout is expired */ #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 14:23:42 -0700 Subject: block: add accessors for setting/querying request deadline We reduce the resolution of request expiry, but since we're already using jiffies for this where resolution depends on the kernel configuration and since the timeout resolution is coarse anyway, that should be fine. 
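Concretely, the cost is one bit of deadline resolution; a minimal sketch of the rounding (illustrative, the helper name is hypothetical):

/* Illustrative only: reserving bit 0 rounds the stored deadline down
 * to an even jiffies value, so n and n + 1 (n even) store identically
 * and expiry can fire at most one jiffy early. */
static inline unsigned long demo_store_deadline(unsigned long j)
{
        return j & ~0x1UL;
}
/* e.g. demo_store_deadline(1000000UL) == demo_store_deadline(1000001UL) */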
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-timeout.c | 14 ++++++++------ block/blk.h | 15 +++++++++++++++ include/linux/blkdev.h | 4 +++- 4 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3239ca9e199f..7035c305be45 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -858,7 +858,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, while (true) { start = read_seqcount_begin(&rq->gstate_seq); gstate = READ_ONCE(rq->gstate); - deadline = rq->deadline; + deadline = blk_rq_deadline(rq); if (!read_seqcount_retry(&rq->gstate_seq, start)) break; cond_resched(); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index ebe99963386c..a05e3676d24a 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req) static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { - if (time_after_eq(jiffies, rq->deadline)) { + const unsigned long deadline = blk_rq_deadline(rq); + + if (time_after_eq(jiffies, deadline)) { list_del_init(&rq->timeout_list); /* @@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout */ if (!blk_mark_rq_complete(rq)) blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { - *next_timeout = rq->deadline; + } else if (!*next_set || time_after(*next_timeout, deadline)) { + *next_timeout = deadline; *next_set = 1; } } @@ -162,7 +164,7 @@ void blk_abort_request(struct request *req) * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ - req->deadline = jiffies; + blk_rq_set_deadline(req, jiffies); mod_timer(&req->q->timeout, 0); } else { if (blk_mark_rq_complete(req)) @@ -213,7 +215,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + blk_rq_set_deadline(req, jiffies + req->timeout); req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* @@ -228,7 +230,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); + expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk.h b/block/blk.h index eb306c52121e..bcd9cf7db0d4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -236,6 +236,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) q->last_merge = NULL; } +/* + * Steal a bit from this field for legacy IO path atomic IO marking. Note that + * setting the deadline clears the bottom bit, potentially clearing the + * completed bit. The user has to be OK with this (current ones are fine). 
+ */ +static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) +{ + rq->__deadline = time & ~0x1UL; +} + +static inline unsigned long blk_rq_deadline(struct request *rq) +{ + return rq->__deadline & ~0x1UL; +} + /* * Internal io_context interface */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba31674d8581..aa6698cf483c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -257,7 +257,9 @@ struct request { struct u64_stats_sync aborted_gstate_sync; u64 aborted_gstate; - unsigned long deadline; + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; + struct list_head timeout_list; /* -- cgit v1.2.3 From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:34:25 -0700 Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit We only have one atomic flag left. Instead of using an entire unsigned long for that, steal the bottom bit of the deadline field that we already reserved. Remove ->atomic_flags, since it's now unused. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq-debugfs.c | 9 +-------- block/blk-mq.c | 2 +- block/blk.h | 19 +++++++++---------- include/linux/blkdev.h | 2 -- 5 files changed, 12 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index f843ae4f858d..7ba607527487 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2853,7 +2853,7 @@ void blk_start_request(struct request *req) wbt_issue(req->q->rq_wb, &req->issue_stat); } - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); + BUG_ON(blk_rq_is_complete(req)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index dd890d5e0fbd..19db3f583bf1 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -294,12 +294,6 @@ static const char *const rqf_name[] = { }; #undef RQF_NAME -#define RQAF_NAME(name) [REQ_ATOM_##name] = #name -static const char *const rqaf_name[] = { - RQAF_NAME(COMPLETE), -}; -#undef RQAF_NAME - int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -316,8 +310,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_puts(m, ", .atomic_flags="); - blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); + seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7035c305be45..87e6b10c8ecb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -294,7 +294,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); @@ -313,6 +312,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; + rq->__deadline = 0; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; diff --git a/block/blk.h b/block/blk.h index bcd9cf7db0d4..c84ae0e21ebd 100644 --- 
a/block/blk.h +++ b/block/blk.h @@ -119,25 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req); -/* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { - REQ_ATOM_COMPLETE = 0, -}; - /* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds + * sure that only one of them succeeds. Steal the bottom bit of the + * __deadline field for this. */ static inline int blk_mark_rq_complete(struct request *rq) { - return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + return test_and_set_bit(0, &rq->__deadline); } static inline void blk_clear_rq_complete(struct request *rq) { - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + clear_bit(0, &rq->__deadline); +} + +static inline bool blk_rq_is_complete(struct request *rq) +{ + return test_bit(0, &rq->__deadline); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aa6698cf483c..d4b2f7bb18d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -156,8 +156,6 @@ struct request { int internal_tag; - unsigned long atomic_flags; - /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ int tag; -- cgit v1.2.3 From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:46:39 -0700 Subject: block: rearrange a few request fields for better cache layout Move completion related items (like the call single data) near the end of the struct, instead of mixing them in with the initial queueing related fields. Move queuelist below the bio structures. Then we have all queueing related bits in the first cache line. This yields a 1.5-2% increase in IOPS for a null_blk test, both for sync and for high thread count access. The sync test goes from 975K to 992K IOPS, the 32-thread case from 20.8M to 21.2M IOPS.
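Layout assumptions like this can be guarded at build time; a hypothetical check (not part of this patch) for keeping the bio pointers on the first cache line might look like:

#include <linux/blkdev.h>
#include <linux/bug.h>
#include <linux/cache.h>

static inline void demo_check_request_layout(void)
{
        /* Hypothetical guard: fail the build if the submission-side
         * bio pointers ever drift past the first cache line on a
         * machine with 64-byte lines. */
        BUILD_BUG_ON(offsetof(struct request, bio) >= SMP_CACHE_BYTES);
        BUILD_BUG_ON(offsetof(struct request, biotail) >= SMP_CACHE_BYTES);
}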
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++++++--------- include/linux/blkdev.h | 28 +++++++++++++++------------- 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 87e6b10c8ecb..435a5a0d441f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -270,8 +270,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - rq->rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; @@ -285,26 +283,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->hctx->tags->rqs[rq->tag] = rq; } - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->rq_flags = 0; + rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - rq->cpu = -1; + INIT_LIST_HEAD(&rq->queuelist); INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; -#endif rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -321,6 +315,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL; rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + data->ctx->rq_dispatched[op_is_sync(op)]++; return rq; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4b2f7bb18d6..71a9371c8182 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { - struct list_head queuelist; - union { - call_single_data_t csd; - u64 fifo_time; - }; - struct request_queue *q; struct blk_mq_ctx *mq_ctx; @@ -164,6 +158,8 @@ struct request { struct bio *bio; struct bio *biotail; + struct list_head queuelist; + /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used @@ -211,19 +207,16 @@ struct request { struct hd_struct *part; unsigned long start_time; struct blk_issue_stat issue_stat; -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ -#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; + #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif + unsigned short write_hint; unsigned short ioprio; unsigned int timeout; @@ -232,8 +225,6 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ - unsigned short write_hint; - /* * On blk-mq, the lower bits of ->gstate (generation number and * state) carry the MQ_RQ_* state value and the upper bits the @@ -260,6 +251,11 @@ struct request { struct list_head timeout_list; + union { + call_single_data_t csd; + u64 fifo_time; + }; + /* * completion callback. 
*/ @@ -268,6 +264,12 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif }; static inline bool blk_rq_is_scsi(struct request *rq) -- cgit v1.2.3 From fa70d2e2c4a0a54ced98260c6a176cc94c876d27 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Jan 2018 22:01:13 -0500 Subject: block: allow gendisk's request_queue registration to be deferred For as long as I can remember, DM has forced the block layer to allow the allocation and initialization of the request_queue to be distinct operations. The reason for this is that block/genhd.c:add_disk() requires the request_queue (and associated bdi) to be tied to the gendisk before add_disk() is called -- because add_disk() also deals with exposing the request_queue via blk_register_queue(). DM's dynamic creation of arbitrary device types (and associated request_queue types) requires the DM device's gendisk be available so that DM table loads can establish a master/slave relationship with subordinate devices that are referenced by loaded DM tables -- using bd_link_disk_holder(). But until these DM tables, and their associated subordinate devices, are known, DM cannot know what type of request_queue it needs -- nor what its queue_limits should be. This chicken and egg scenario has created all manner of problems for DM and, at times, the block layer. Summary of changes: - Add device_add_disk_no_queue_reg() and add_disk_no_queue_reg() variants that drivers may use to add a disk without also calling blk_register_queue(). The driver must call blk_register_queue() once its request_queue is fully initialized. - Return early from blk_unregister_queue() if QUEUE_FLAG_REGISTERED is not set. It won't be set if the driver used add_disk_no_queue_reg() but encountered an error and had to del_gendisk() before calling blk_register_queue(). - Export blk_register_queue(). These changes allow DM to use add_disk_no_queue_reg() to anchor its gendisk as the "master" for master/slave relationships DM must establish with subordinate devices referenced in DM tables that get loaded. Once all "slave" devices for a DM device are known, its request_queue can be properly initialized and then advertised via sysfs -- the important improvement being that no request_queue resource initialization performed by blk_register_queue() is missed for DM devices anymore. Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 5 +++++ block/genhd.c | 20 +++++++++++++++++--- include/linux/genhd.h | 5 +++++ 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9272452ff456..4a6a40ffd78e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -921,6 +921,7 @@ unlock: mutex_unlock(&q->sysfs_lock); return ret; } +EXPORT_SYMBOL_GPL(blk_register_queue); void blk_unregister_queue(struct gendisk *disk) { @@ -929,6 +930,10 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + /* Return early if disk->queue was never registered. */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return; + /* * Protect against the 'queue' kobj being accessed * while/after it is removed.
diff --git a/block/genhd.c b/block/genhd.c index 00620e01e043..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit: } /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, + bool register_queue) { dev_t devt; int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) exact_match, exact_lock, disk); } register_disk(parent, disk); - blk_register_queue(disk); + if (register_queue) + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, true); +} EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe046c9..5e3531027b51 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk); } +extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); +static inline void add_disk_no_queue_reg(struct gendisk *disk) +{ + device_add_disk_no_queue_reg(NULL, disk); +} extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); -- cgit v1.2.3 From ddc212313f16cd65fcf5e8d9ae223f8374822e4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jan 2018 16:01:36 +0100 Subject: blkcg: simplify statistic accumulation code Some older compilers (gcc-4.4 through 4.6 in particular) struggle with the way that blkg_rwstat_read() returns a structure, leading to excessive stack usage and rather inefficient code: block/blk-cgroup.c: In function 'blkg_destroy': block/blk-cgroup.c:354:1: error: the frame size of 1296 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] block/cfq-iosched.c: In function 'cfqg_stats_add_aux': block/cfq-iosched.c:753:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] block/bfq-cgroup.c: In function 'bfqg_stats_add_aux': block/bfq-cgroup.c:299:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] I also notice that there is no point in using atomic accesses for the local variables, so storing the temporaries in simple 'u64' variables not only avoids the stack usage on older compilers but also improves the object code on modern versions. 
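A rough userspace analogue of the two code shapes makes the cost visible. This is illustrative only: the struct and function names are invented, NR stands in for BLKG_RWSTAT_NR, and the real kernel code sums percpu counters rather than plain atomics.

#include <stdatomic.h>
#include <stdint.h>

#define NR 4 /* stands in for BLKG_RWSTAT_NR */

struct rwstat_like { _Atomic int64_t cnt[NR]; };

/* Old shape: returning the struct by value makes every inlined call
 * site materialize a full temporary copy on the stack. */
static struct rwstat_like read_by_value(struct rwstat_like *s)
{
	struct rwstat_like v;
	int i;

	for (i = 0; i < NR; i++)
		atomic_store(&v.cnt[i], atomic_load(&s->cnt[i]));
	return v;
}

/* New shape: plain integer temporaries live in registers or a small
 * stack slot, and each counter needs only one atomic add. */
static void add_aux(struct rwstat_like *to, struct rwstat_like *from)
{
	uint64_t sum[NR];
	int i;

	for (i = 0; i < NR; i++)
		sum[i] = atomic_load(&from->cnt[i]);
	for (i = 0; i < NR; i++)
		atomic_fetch_add(&to->cnt[i], sum[i]);
}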
Fixes: e6269c445467 ("blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it") Acked-by: Tejun Heo Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e9825ff57b15..69bea82ebeb1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { - struct blkg_rwstat v = blkg_rwstat_read(from); + u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&v.aux_cnt[i]) + - atomic64_read(&from->aux_cnt[i]), + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } -- cgit v1.2.3 From 88de4598bca84e27b261685c06fff816b8d932a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2017 14:50:00 +0100 Subject: nvme-pci: clean up CMBSZ bit definitions Define the bit positions instead of macros that use magic values, and move the expanded helpers that calculate the size and size unit into the implementation C file. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Logan Gunthorpe --- drivers/nvme/host/pci.c | 23 +++++++++++++++++------ include/linux/nvme.h | 22 ++++++++++++++-------- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index edb57e984865..a2ffb557b616 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1364,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; @@ -1651,9 +1651,21 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) +{ + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + static void nvme_map_cmb(struct nvme_dev *dev) { - u64 szu, size, offset; + u64 size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); int bar; @@ -1666,9 +1678,8 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return; - szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); - size = szu * NVME_CMB_SZ(dev->cmbsz); - offset = szu * NVME_CMB_OFST(dev->cmbloc); + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc); bar_size = pci_resource_len(pdev, bar); @@ -1897,7 +1908,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result =
nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aea87f0d917b..4112e2bd747f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -124,14 +124,20 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) -#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) -#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) - -#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) -#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) -#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) -#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) -#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; /* * Submission and Completion Queue Entry Sizes for the NVM command set. -- cgit v1.2.3 From 83d016ac86428dbca8a62d3e4fdc29e3ea39e535 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2018 11:48:08 -0800 Subject: block: Unexport elv_register_queue() and elv_unregister_queue() These two functions are only called from inside the block layer, so unexport them. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ block/elevator.c | 2 -- include/linux/elevator.h | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/block/blk.h b/block/blk.h index c84ae0e21ebd..b1771851ed92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -162,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/elevator.c b/block/elevator.c index 138faeb08a7c..4f00b53cd5fd 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -886,7 +886,6 @@ int elv_register_queue(struct request_queue *q) } return error; } -EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) { @@ -900,7 +899,6 @@ void elv_unregister_queue(struct request_queue *q) wbt_enable_default(q); } } -EXPORT_SYMBOL(elv_unregister_queue); int elv_register(struct elevator_type *e) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3dc532..6d9e230dffd2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_register_queue(struct request_queue *q); -extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, -- cgit v1.2.3 From 8c7a8d1c4b9c30a2be3b31a2e6af1cefd45574eb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 11:00:54 -0800 Subject:
lib/scatterlist: Fix chaining support in sgl_alloc_order() This patch prevents workloads with large block sizes (megabytes) from triggering the following bad page state report with the ib_srpt driver (the only driver that chains scatterlists allocated by sgl_alloc_order()): BUG: Bad page state in process kworker/0:1H pfn:2423a78 page:fffffb03d08e9e00 count:-3 mapcount:0 mapping: (null) index:0x0 flags: 0x57ffffc0000000() raw: 0057ffffc0000000 0000000000000000 0000000000000000 fffffffdffffffff raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 page dumped because: nonzero _count CPU: 0 PID: 733 Comm: kworker/0:1H Tainted: G I 4.15.0-rc7.bart+ #1 Hardware name: HP ProLiant DL380 G7, BIOS P67 08/16/2015 Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] Call Trace: dump_stack+0x5c/0x83 bad_page+0xf5/0x10f get_page_from_freelist+0xa46/0x11b0 __alloc_pages_nodemask+0x103/0x290 sgl_alloc_order+0x101/0x180 target_alloc_sgl+0x2c/0x40 [target_core_mod] srpt_alloc_rw_ctxs+0x173/0x2d0 [ib_srpt] srpt_handle_new_iu+0x61e/0x7f0 [ib_srpt] __ib_process_cq+0x55/0xa0 [ib_core] ib_cq_poll_work+0x1b/0x60 [ib_core] process_one_work+0x141/0x340 worker_thread+0x47/0x3e0 kthread+0xf5/0x130 ret_from_fork+0x1f/0x30 Fixes: e80a0af4759a ("lib/scatterlist: Introduce sgl_alloc() and sgl_free()") Reported-by: Laurence Oberman Tested-by: Laurence Oberman Signed-off-by: Bart Van Assche Cc: Nicholas A. Bellinger Cc: Laurence Oberman Signed-off-by: Jens Axboe --- drivers/target/target_core_transport.c | 2 +- include/linux/scatterlist.h | 1 + lib/scatterlist.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index a001ba711cca..c03a78ee26cd 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2300,7 +2300,7 @@ queue_full: void target_free_sgl(struct scatterlist *sgl, int nents) { - sgl_free(sgl); + sgl_free_n_order(sgl, nents, 0); } EXPORT_SYMBOL(target_free_sgl); diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b8a7c1d1dbe3..22b2131bcdcd 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -282,6 +282,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); +void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 9afc9b432083..53728d391d3a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -512,7 +512,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, if (!sgl) return NULL; - sg_init_table(sgl, nent); + sg_init_table(sgl, nalloc); sg = sgl; while (length) { elem_len = min_t(u64, length, PAGE_SIZE << order); @@ -526,7 +526,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, length -= elem_len; sg = sg_next(sg); } - WARN_ON_ONCE(sg); + WARN_ONCE(length, "length = %lld\n", length); if (nent_p) *nent_p = nent; return sgl; @@ -549,22 +549,44 @@ struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, EXPORT_SYMBOL(sgl_alloc); /** - * sgl_free_order - free a scatterlist and its pages + * sgl_free_n_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more
elements + * @nents: Maximum number of elements to free * @order: Second argument for __free_pages() + * + * Notes: + * - If several scatterlists have been chained and each chain element is + * freed separately then it's essential to set nents correctly to avoid that a + * page would get freed twice. + * - All pages in a chained scatterlist can be freed at once by setting @nents + * to a high number. */ -void sgl_free_order(struct scatterlist *sgl, int order) +void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) { struct scatterlist *sg; struct page *page; + int i; - for (sg = sgl; sg; sg = sg_next(sg)) { + for_each_sg(sgl, sg, nents, i) { + if (!sg) + break; page = sg_page(sg); if (page) __free_pages(page, order); } kfree(sgl); } +EXPORT_SYMBOL(sgl_free_n_order); + +/** + * sgl_free_order - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + * @order: Second argument for __free_pages() + */ +void sgl_free_order(struct scatterlist *sgl, int order) +{ + sgl_free_n_order(sgl, INT_MAX, order); +} EXPORT_SYMBOL(sgl_free_order); /** -- cgit v1.2.3 From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:56 -0800 Subject: block: Remove kblockd_schedule_delayed_work{,_on}() The previous patch removed all users of these two functions. Hence also remove the functions themselves. Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 14 -------------- include/linux/blkdev.h | 2 -- 2 files changed, 16 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 1645a1e54a37..cdae69be68e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3441,20 +3441,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); - /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71a9371c8182..afc43fb63c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3
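Any remaining external caller of the removed helpers can switch to the still-exported kblockd_mod_delayed_work_on(). A minimal sketch of the conversion follows (the wrapper name is made up; note the semantic difference that mod_delayed_work re-arms an already-pending item, where the old queue_delayed_work-based helpers left it untouched):

#include <linux/blkdev.h>
#include <linux/workqueue.h>

/* Sketch of what a caller of the removed kblockd_schedule_delayed_work()
 * would do instead; WORK_CPU_UNBOUND preserves the old "any CPU"
 * placement. */
static void kick_later(struct delayed_work *dwork, unsigned long delay)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, dwork, delay);
}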