diff options
Diffstat (limited to 'drivers/block')
32 files changed, 1053 insertions, 767 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 1bb8ec575352..025b1b77b11a 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -432,16 +432,6 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. -config VIRTIO_BLK_SCSI - bool "SCSI passthrough request for the Virtio block driver" - depends on VIRTIO_BLK - select BLK_SCSI_REQUEST - ---help--- - Enable support for SCSI passthrough (e.g. the SG_IO ioctl) on - virtio-blk devices. This is only supported for the legacy - virtio protocol and not enabled by default by any hypervisor. - You probably want to use virtio-scsi instead. - config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index bd19f8af950b..7b32fb673375 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -329,6 +329,7 @@ static const struct block_device_operations aoe_bdops = { .open = aoeblk_open, .release = aoeblk_release, .ioctl = aoeblk_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, .getgeo = aoeblk_getgeo, .owner = THIS_MODULE, }; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index bd7d3bb8b890..1553d41f0b91 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -857,7 +857,7 @@ static void fd_calibrate( void ) } if (ATARIHW_PRESENT(FDCSPEED)) - dma_wd.fdc_speed = 0; /* always seek with 8 Mhz */; + dma_wd.fdc_speed = 0; /* always seek with 8 Mhz */ DPRINT(("fd_calibrate\n")); SET_IRQ_HANDLER( fd_calibrate_done ); /* we can't verify, since the speed may be incorrect */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c548a5a6c1a0..220c5e18aba0 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) unsigned int len = bvec.bv_len; int err; + /* Don't support un-aligned buffer */ + WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || + (len & (SECTOR_SIZE - 1))); + err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, bio_op(bio), sector); if (err) @@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i) goto out_free_dev; blk_queue_make_request(brd->brd_queue, brd_make_request); - blk_queue_max_hw_sectors(brd->brd_queue, 1024); /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN @@ -470,6 +473,25 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) return kobj; } +static inline void brd_check_and_reset_par(void) +{ + if (unlikely(!max_part)) + max_part = 1; + + /* + * make sure 'max_part' can be divided exactly by (1U << MINORBITS), + * otherwise, it is possiable to get same dev_t when adding partitions. + */ + if ((1U << MINORBITS) % max_part != 0) + max_part = 1UL << fls(max_part); + + if (max_part > DISK_MAX_PARTS) { + pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n", + DISK_MAX_PARTS, DISK_MAX_PARTS); + max_part = DISK_MAX_PARTS; + } +} + static int __init brd_init(void) { struct brd_device *brd, *next; @@ -493,8 +515,7 @@ static int __init brd_init(void) if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - if (unlikely(!max_part)) - max_part = 1; + brd_check_and_reset_par(); for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ddbf56014c51..aae99a2d7bd4 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -622,7 +622,7 @@ struct fifo_buffer { int total; /* sum of all values */ int values[0]; }; -extern struct fifo_buffer *fifo_alloc(int fifo_size); +extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size); /* flag bits per connection */ enum { diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index c58986556161..651bd0236a99 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node) return this->end; } -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. - */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); - - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} +#define NODE_END(node) ((node)->sector + ((node)->size >> 9)) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, - sector_t, end, compute_subtree_last); +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct drbd_interval, rb, sector_t, end, NODE_END); /** * drbd_insert_interval - insert a new interval into a tree diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5b248763a672..a18155cdce41 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -786,7 +786,6 @@ int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cm if (nc->tentative && connection->agreed_pro_version < 92) { rcu_read_unlock(); - mutex_unlock(&sock->mutex); drbd_err(connection, "--dry-run is not supported by peer"); return -EOPNOTSUPP; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 5d52a2d32155..da4a3ebe04ef 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -268,19 +268,18 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, /* some more paranoia, if the request was over-determined */ if (adm_ctx->device && adm_ctx->resource && adm_ctx->device->resource != adm_ctx->resource) { - pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", - adm_ctx->minor, adm_ctx->resource->name, - adm_ctx->device->resource->name); + pr_warn("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST; } if (adm_ctx->device && adm_ctx->volume != VOLUME_UNSPECIFIED && adm_ctx->volume != adm_ctx->device->vnr) { - pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx->minor, adm_ctx->volume, - adm_ctx->device->vnr, - adm_ctx->device->resource->name); + pr_warn("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, adm_ctx->device->resource->name); drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST; } @@ -1576,7 +1575,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; - int err, fifo_size; + int err; + unsigned int fifo_size; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2b3103c30857..79e216446030 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3887,7 +3887,7 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; const int apv = connection->agreed_pro_version; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; - int fifo_size = 0; + unsigned int fifo_size = 0; int err; peer_device = conn_peer_device(connection, pi->vnr); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f86cea4c0f8d..840c3aef3c5c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -884,7 +884,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, start_new_tl_epoch(connection); mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); break; - }; + } return rv; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5bdcc70ad589..b7f605c6e231 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -482,11 +482,11 @@ static void fifo_add_val(struct fifo_buffer *fb, int value) fb->values[i] += value; } -struct fifo_buffer *fifo_alloc(int fifo_size) +struct fifo_buffer *fifo_alloc(unsigned int fifo_size) { struct fifo_buffer *fb; - fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); + fb = kzalloc(struct_size(fb, values, fifo_size), GFP_NOIO); if (!fb) return NULL; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 0469aceaa230..cd3612e4e2e1 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3780,7 +3780,7 @@ static int compat_getdrvprm(int drive, v.native_format = UDP->native_format; mutex_unlock(&floppy_mutex); - if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_params))) + if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_params))) return -EFAULT; return 0; } @@ -3816,7 +3816,7 @@ static int compat_getdrvstat(int drive, bool poll, v.bufblocks = UDRS->bufblocks; mutex_unlock(&floppy_mutex); - if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_struct))) + if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_struct))) return -EFAULT; return 0; Eintr: @@ -3879,6 +3879,9 @@ static int fd_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int { int drive = (long)bdev->bd_disk->private_data; switch (cmd) { + case CDROMEJECT: /* CD-ROM eject */ + case 0x6470: /* SunOS floppy eject */ + case FDMSGON: case FDMSGOFF: case FDSETEMSGTRESH: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ab7ca5989097..739b372a5112 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -417,18 +417,20 @@ out_free_page: return ret; } -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, + int mode) { /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * We use fallocate to manipulate the space mappings used by the image + * a.k.a. discard/zerorange. However we do not support this if + * encryption is enabled, because it may give an attacker useful + * information. */ struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; + mode |= FALLOC_FL_KEEP_SIZE; + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { ret = -EOPNOTSUPP; goto out; @@ -596,9 +598,17 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); - case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - return lo_discard(lo, rq, pos); + /* + * If the caller doesn't want deallocation, call zeroout to + * write zeroes the range. Otherwise, punch them out. + */ + return lo_fallocate(lo, rq, pos, + (rq->cmd_flags & REQ_NOUNMAP) ? + FALLOC_FL_ZERO_RANGE : + FALLOC_FL_PUNCH_HOLE); + case REQ_OP_DISCARD: + return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: if (lo->transfer) return lo_write_transfer(lo, rq, pos); @@ -630,7 +640,9 @@ static void loop_reread_partitions(struct loop_device *lo, { int rc; - rc = blkdev_reread_part(bdev); + mutex_lock(&bdev->bd_mutex); + rc = bdev_disk_changed(bdev, false); + mutex_unlock(&bdev->bd_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -994,6 +1006,16 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); + if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + /* In case of direct I/O, match underlying block size */ + unsigned short bsize = bdev_logical_block_size( + inode->i_sb->s_bdev); + + blk_queue_logical_block_size(lo->lo_queue, bsize); + blk_queue_physical_block_size(lo->lo_queue, bsize); + blk_queue_io_min(lo->lo_queue, bsize); + } + loop_update_rotational(lo); loop_update_dio(lo); set_capacity(lo->lo_disk, size); @@ -1144,10 +1166,11 @@ out_unlock: * must be at least one and it can only become zero when the * current holder is released. */ - if (release) - err = __blkdev_reread_part(bdev); - else - err = blkdev_reread_part(bdev); + if (!release) + mutex_lock(&bdev->bd_mutex); + err = bdev_disk_changed(bdev, false); + if (!release) + mutex_unlock(&bdev->bd_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo_number, err); @@ -1755,6 +1778,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: + case LOOP_SET_DIRECT_IO: err = lo_ioctl(bdev, mode, cmd, arg); break; default: diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 964f78cfffa0..f6bafa9a68b9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -129,7 +129,7 @@ struct mtip_compat_ide_task_request_s { /* * This function check_for_surprise_removal is called * while card is removed from the system and it will - * read the vendor id from the configration space + * read the vendor id from the configuration space * * @pdev Pointer to the pci_dev structure. * diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e21d2ded732b..78181908f0df 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -26,6 +26,7 @@ #include <linux/ioctl.h> #include <linux/mutex.h> #include <linux/compiler.h> +#include <linux/completion.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -71,14 +72,17 @@ struct link_dead_args { int index; }; -#define NBD_TIMEDOUT 0 +#define NBD_RT_TIMEDOUT 0 +#define NBD_RT_DISCONNECT_REQUESTED 1 +#define NBD_RT_DISCONNECTED 2 +#define NBD_RT_HAS_PID_FILE 3 +#define NBD_RT_HAS_CONFIG_REF 4 +#define NBD_RT_BOUND 5 +#define NBD_RT_DESTROY_ON_DISCONNECT 6 +#define NBD_RT_DISCONNECT_ON_CLOSE 7 + +#define NBD_DESTROY_ON_DISCONNECT 0 #define NBD_DISCONNECT_REQUESTED 1 -#define NBD_DISCONNECTED 2 -#define NBD_HAS_PID_FILE 3 -#define NBD_HAS_CONFIG_REF 4 -#define NBD_BOUND 5 -#define NBD_DESTROY_ON_DISCONNECT 6 -#define NBD_DISCONNECT_ON_CLOSE 7 struct nbd_config { u32 flags; @@ -108,10 +112,14 @@ struct nbd_device { struct nbd_config *config; struct mutex config_lock; struct gendisk *disk; + struct workqueue_struct *recv_workq; struct list_head list; struct task_struct *task_recv; struct task_struct *task_setup; + + struct completion *destroy_complete; + unsigned long flags; }; #define NBD_CMD_REQUEUED 1 @@ -121,6 +129,7 @@ struct nbd_cmd { struct mutex lock; int index; int cookie; + int retries; blk_status_t status; unsigned long flags; u32 cmd_cookie; @@ -138,7 +147,6 @@ static struct dentry *nbd_dbg_dir; static unsigned int nbds_max = 16; static int max_part = 16; -static struct workqueue_struct *recv_workqueue; static int part_shift; static int nbd_dev_dbg_init(struct nbd_device *nbd); @@ -222,6 +230,16 @@ static void nbd_dev_remove(struct nbd_device *nbd) disk->private_data = NULL; put_disk(disk); } + + /* + * Place this in the last just before the nbd is freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) + complete(nbd->destroy_complete); + kfree(nbd); } @@ -230,15 +248,15 @@ static void nbd_put(struct nbd_device *nbd) if (refcount_dec_and_mutex_lock(&nbd->refs, &nbd_index_mutex)) { idr_remove(&nbd_index_idr, nbd->index); - mutex_unlock(&nbd_index_mutex); nbd_dev_remove(nbd); + mutex_unlock(&nbd_index_mutex); } } static int nbd_disconnected(struct nbd_config *config) { - return test_bit(NBD_DISCONNECTED, &config->runtime_flags) || - test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || + test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); } static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, @@ -256,9 +274,9 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); if (atomic_dec_return(&nbd->config->live_connections) == 0) { - if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED, + if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, &nbd->config->runtime_flags)) { - set_bit(NBD_DISCONNECTED, + set_bit(NBD_RT_DISCONNECTED, &nbd->config->runtime_flags); dev_info(nbd_to_dev(nbd), "Disconnected due to user request.\n"); @@ -332,7 +350,7 @@ static void sock_shutdown(struct nbd_device *nbd) if (config->num_connections == 0) return; - if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return; for (i = 0; i < config->num_connections; i++) { @@ -344,6 +362,22 @@ static void sock_shutdown(struct nbd_device *nbd) dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); } +static u32 req_to_nbd_cmd_type(struct request *req) +{ + switch (req_op(req)) { + case REQ_OP_DISCARD: + return NBD_CMD_TRIM; + case REQ_OP_FLUSH: + return NBD_CMD_FLUSH; + case REQ_OP_WRITE: + return NBD_CMD_WRITE; + case REQ_OP_READ: + return NBD_CMD_READ; + default: + return U32_MAX; + } +} + static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, bool reserved) { @@ -351,15 +385,16 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, struct nbd_device *nbd = cmd->nbd; struct nbd_config *config; + if (!mutex_trylock(&cmd->lock)) + return BLK_EH_RESET_TIMER; + if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; + mutex_unlock(&cmd->lock); goto done; } config = nbd->config; - if (!mutex_trylock(&cmd->lock)) - return BLK_EH_RESET_TIMER; - if (config->num_connections > 1) { dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out, retrying (%d/%d alive)\n", @@ -389,11 +424,26 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, nbd_config_put(nbd); return BLK_EH_DONE; } - } else { - dev_err_ratelimited(nbd_to_dev(nbd), - "Connection timed out\n"); } - set_bit(NBD_TIMEDOUT, &config->runtime_flags); + + if (!nbd->tag_set.timeout) { + /* + * Userspace sets timeout=0 to disable socket disconnection, + * so just warn and reset the timer. + */ + cmd->retries++; + dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n", + req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)), + (unsigned long long)blk_rq_pos(req) << 9, + blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries); + + mutex_unlock(&cmd->lock); + nbd_config_put(nbd); + return BLK_EH_RESET_TIMER; + } + + dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); + set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); sock_shutdown(nbd); @@ -480,22 +530,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); - switch (req_op(req)) { - case REQ_OP_DISCARD: - type = NBD_CMD_TRIM; - break; - case REQ_OP_FLUSH: - type = NBD_CMD_FLUSH; - break; - case REQ_OP_WRITE: - type = NBD_CMD_WRITE; - break; - case REQ_OP_READ: - type = NBD_CMD_READ; - break; - default: + type = req_to_nbd_cmd_type(req); + if (type == U32_MAX) return -EIO; - } if (rq_data_dir(req) == WRITE && (config->flags & NBD_FLAG_READ_ONLY)) { @@ -526,6 +563,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) } cmd->index = index; cmd->cookie = nsock->cookie; + cmd->retries = 0; request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); @@ -672,6 +710,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } + if (cmd->status != BLK_STS_OK) { + dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n", + req); + ret = -ENOENT; + goto out; + } if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n", req); @@ -753,7 +797,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + mutex_lock(&cmd->lock); cmd->status = BLK_STS_IOERR; + mutex_unlock(&cmd->lock); + blk_mq_complete_request(req); return true; } @@ -773,7 +820,7 @@ static int find_fallback(struct nbd_device *nbd, int index) struct nbd_sock *nsock = config->socks[index]; int fallback = nsock->fallback_index; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return new_index; if (config->num_connections <= 1) { @@ -814,7 +861,7 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return 0; return wait_event_timeout(config->conn_wait, atomic_read(&config->live_connections) > 0, @@ -933,6 +980,26 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, return ret; } +static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, + int *err) +{ + struct socket *sock; + + *err = 0; + sock = sockfd_lookup(fd, err); + if (!sock) + return NULL; + + if (sock->ops->shutdown == sock_no_shutdown) { + dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); + *err = -EINVAL; + sockfd_put(sock); + return NULL; + } + + return sock; +} + static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, bool netlink) { @@ -942,17 +1009,17 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, struct nbd_sock *nsock; int err; - sock = sockfd_lookup(arg, &err); + sock = nbd_get_socket(nbd, arg, &err); if (!sock) return err; if (!netlink && !nbd->task_setup && - !test_bit(NBD_BOUND, &config->runtime_flags)) + !test_bit(NBD_RT_BOUND, &config->runtime_flags)) nbd->task_setup = current; if (!netlink && (nbd->task_setup != current || - test_bit(NBD_BOUND, &config->runtime_flags))) { + test_bit(NBD_RT_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); sockfd_put(sock); @@ -965,14 +1032,15 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, sockfd_put(sock); return -ENOMEM; } + + config->socks = socks; + nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL); if (!nsock) { sockfd_put(sock); return -ENOMEM; } - config->socks = socks; - nsock->fallback_index = -1; nsock->dead = false; mutex_init(&nsock->tx_lock); @@ -994,7 +1062,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) int i; int err; - sock = sockfd_lookup(arg, &err); + sock = nbd_get_socket(nbd, arg, &err); if (!sock) return err; @@ -1031,12 +1099,12 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) mutex_unlock(&nsock->tx_lock); sockfd_put(old); - clear_bit(NBD_DISCONNECTED, &config->runtime_flags); + clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. */ - queue_work(recv_workqueue, &args->work); + queue_work(nbd->recv_workq, &args->work); atomic_inc(&config->live_connections); wake_up(&config->conn_wait); @@ -1102,7 +1170,8 @@ static int nbd_disconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); send_disconnects(nbd); return 0; } @@ -1121,7 +1190,7 @@ static void nbd_config_put(struct nbd_device *nbd) struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); nbd_size_clear(nbd); - if (test_and_clear_bit(NBD_HAS_PID_FILE, + if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->task_recv = NULL; @@ -1137,6 +1206,10 @@ static void nbd_config_put(struct nbd_device *nbd) kfree(nbd->config); nbd->config = NULL; + if (nbd->recv_workq) + destroy_workqueue(nbd->recv_workq); + nbd->recv_workq = NULL; + nbd->tag_set.timeout = 0; nbd->disk->queue->limits.discard_granularity = 0; nbd->disk->queue->limits.discard_alignment = 0; @@ -1165,6 +1238,14 @@ static int nbd_start_device(struct nbd_device *nbd) return -EINVAL; } + nbd->recv_workq = alloc_workqueue("knbd%d-recv", + WQ_MEM_RECLAIM | WQ_HIGHPRI | + WQ_UNBOUND, 0, nbd->index); + if (!nbd->recv_workq) { + dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n"); + return -ENOMEM; + } + blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); nbd->task_recv = current; @@ -1175,7 +1256,7 @@ static int nbd_start_device(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); return error; } - set_bit(NBD_HAS_PID_FILE, &config->runtime_flags); + set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { @@ -1184,6 +1265,16 @@ static int nbd_start_device(struct nbd_device *nbd) args = kzalloc(sizeof(*args), GFP_KERNEL); if (!args) { sock_shutdown(nbd); + /* + * If num_connections is m (2 < m), + * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful. + * But NO.(n + 1) failed. We still have n recv threads. + * So, add flush_workqueue here to prevent recv threads + * dropping the last config_refs and trying to destroy + * the workqueue from inside the workqueue. + */ + if (i) + flush_workqueue(nbd->recv_workq); return -ENOMEM; } sk_set_memalloc(config->socks[i]->sock->sk); @@ -1195,7 +1286,7 @@ static int nbd_start_device(struct nbd_device *nbd) INIT_WORK(&args->work, recv_work); args->nbd = nbd; args->index = i; - queue_work(recv_workqueue, &args->work); + queue_work(nbd->recv_workq, &args->work); } nbd_size_update(nbd); return error; @@ -1217,12 +1308,14 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b atomic_read(&config->recv_threads) == 0); if (ret) sock_shutdown(nbd); + flush_workqueue(nbd->recv_workq); + mutex_lock(&nbd->config_lock); nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ - if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; - if (test_bit(NBD_TIMEDOUT, &config->runtime_flags)) + if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) ret = -ETIMEDOUT; return ret; } @@ -1233,7 +1326,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, sock_shutdown(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(bdev); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -1246,6 +1339,13 @@ static bool nbd_is_valid_blksize(unsigned long blksize) return true; } +static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) +{ + nbd->tag_set.timeout = timeout * HZ; + if (timeout) + blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); +} + /* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) @@ -1276,10 +1376,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: - if (arg) { - nbd->tag_set.timeout = arg * HZ; - blk_queue_rq_timeout(nbd->disk->queue, arg * HZ); - } + nbd_set_cmd_timeout(nbd, arg); return 0; case NBD_SET_FLAGS: @@ -1324,7 +1421,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, /* Don't allow ioctl operations on a nbd device that was created with * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. */ - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) error = __nbd_ioctl(bdev, nbd, cmd, arg); else @@ -1395,7 +1492,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; struct block_device *bdev = bdget_disk(disk, 0); - if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && + if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && bdev->bd_openers == 0) nbd_disconnect_and_put(nbd); @@ -1596,6 +1693,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + nbd->destroy_complete = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) @@ -1710,6 +1808,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { + DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd = NULL; struct nbd_config *config; int index = -1; @@ -1761,6 +1860,17 @@ again: mutex_unlock(&nbd_index_mutex); return -EINVAL; } + + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; + mutex_unlock(&nbd_index_mutex); + + /* Wait untill the the nbd stuff is totally destroyed */ + wait_for_completion(&destroy_complete); + goto again; + } + if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); if (index == -1) @@ -1793,17 +1903,15 @@ again: return -ENOMEM; } refcount_set(&nbd->config_refs, 1); - set_bit(NBD_BOUND, &config->runtime_flags); + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) goto out; - if (info->attrs[NBD_ATTR_TIMEOUT]) { - u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); - nbd->tag_set.timeout = timeout * HZ; - blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); - } + if (info->attrs[NBD_ATTR_TIMEOUT]) + nbd_set_cmd_timeout(nbd, + nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); @@ -1815,12 +1923,15 @@ again: if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - set_bit(NBD_DESTROY_ON_DISCONNECT, + set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); put_dev = true; + } else { + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } @@ -1859,7 +1970,7 @@ again: out: mutex_unlock(&nbd->config_lock); if (!ret) { - set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags); + set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } @@ -1875,7 +1986,13 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) nbd_disconnect(nbd); nbd_clear_sock(nbd); mutex_unlock(&nbd->config_lock); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + /* + * Make sure recv thread has finished, so it does not drop the last + * config ref and try to destroy the workqueue from inside the work + * queue. + */ + flush_workqueue(nbd->recv_workq); + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -1959,7 +2076,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->task_recv) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); @@ -1971,11 +2088,9 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (ret) goto out; - if (info->attrs[NBD_ATTR_TIMEOUT]) { - u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); - nbd->tag_set.timeout = timeout * HZ; - blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); - } + if (info->attrs[NBD_ATTR_TIMEOUT]) + nbd_set_cmd_timeout(nbd, + nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); @@ -1984,20 +2099,22 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } else { - if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } else { - clear_bit(NBD_DISCONNECT_ON_CLOSE, + clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } @@ -2261,20 +2378,12 @@ static int __init nbd_init(void) if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; - recv_workqueue = alloc_workqueue("knbd-recv", - WQ_MEM_RECLAIM | WQ_HIGHPRI | - WQ_UNBOUND, 0); - if (!recv_workqueue) - return -ENOMEM; - if (register_blkdev(NBD_MAJOR, "nbd")) { - destroy_workqueue(recv_workqueue); + if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; - } if (genl_register_family(&nbd_genl_family)) { unregister_blkdev(NBD_MAJOR, "nbd"); - destroy_workqueue(recv_workqueue); return -EINVAL; } nbd_dbg_init(); @@ -2316,7 +2425,6 @@ static void __exit nbd_cleanup(void) idr_destroy(&nbd_index_idr); genl_unregister_family(&nbd_genl_family); - destroy_workqueue(recv_workqueue); unregister_blkdev(NBD_MAJOR, "nbd"); } diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index a1b9929bd911..bc837862b767 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -2,6 +2,9 @@ #ifndef __BLK_NULL_BLK_H #define __BLK_NULL_BLK_H +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/blk-mq.h> @@ -88,28 +91,32 @@ struct nullb { #ifdef CONFIG_BLK_DEV_ZONED int null_zone_init(struct nullb_device *dev); void null_zone_exit(struct nullb_device *dev); -int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones); -void null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors); -void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +blk_status_t null_handle_zoned(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len); #else static inline int null_zone_init(struct nullb_device *dev) { - pr_err("null_blk: CONFIG_BLK_DEV_ZONED not enabled\n"); + pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); return -EINVAL; } static inline void null_zone_exit(struct nullb_device *dev) {} -static inline int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, - unsigned int *nr_zones) +static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors) { - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; } -static inline void null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors) +static inline size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, + unsigned int len) { + return len; } -static inline void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) {} +#define null_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 99328ded60d1..16510795e377 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -141,8 +141,8 @@ static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); -static int nr_devices = 1; -module_param(nr_devices, int, 0444); +static unsigned int nr_devices = 1; +module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; @@ -227,7 +227,7 @@ static ssize_t nullb_device_uint_attr_store(unsigned int *val, int result; result = kstrtouint(page, 0, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -241,7 +241,7 @@ static ssize_t nullb_device_ulong_attr_store(unsigned long *val, unsigned long tmp; result = kstrtoul(page, 0, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -255,7 +255,7 @@ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, int result; result = kstrtobool(page, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -263,42 +263,68 @@ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, } /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ -#define NULLB_DEVICE_ATTR(NAME, TYPE) \ -static ssize_t \ -nullb_device_##NAME##_show(struct config_item *item, char *page) \ -{ \ - return nullb_device_##TYPE##_attr_show( \ - to_nullb_device(item)->NAME, page); \ -} \ -static ssize_t \ -nullb_device_##NAME##_store(struct config_item *item, const char *page, \ - size_t count) \ -{ \ - if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \ - return -EBUSY; \ - return nullb_device_##TYPE##_attr_store( \ - &to_nullb_device(item)->NAME, page, count); \ -} \ +#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ +static ssize_t \ +nullb_device_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return nullb_device_##TYPE##_attr_show( \ + to_nullb_device(item)->NAME, page); \ +} \ +static ssize_t \ +nullb_device_##NAME##_store(struct config_item *item, const char *page, \ + size_t count) \ +{ \ + int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ + struct nullb_device *dev = to_nullb_device(item); \ + TYPE uninitialized_var(new_value); \ + int ret; \ + \ + ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ + if (ret < 0) \ + return ret; \ + if (apply_fn) \ + ret = apply_fn(dev, new_value); \ + else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ + ret = -EBUSY; \ + if (ret < 0) \ + return ret; \ + dev->NAME = new_value; \ + return count; \ +} \ CONFIGFS_ATTR(nullb_device_, NAME); -NULLB_DEVICE_ATTR(size, ulong); -NULLB_DEVICE_ATTR(completion_nsec, ulong); -NULLB_DEVICE_ATTR(submit_queues, uint); -NULLB_DEVICE_ATTR(home_node, uint); -NULLB_DEVICE_ATTR(queue_mode, uint); -NULLB_DEVICE_ATTR(blocksize, uint); -NULLB_DEVICE_ATTR(irqmode, uint); -NULLB_DEVICE_ATTR(hw_queue_depth, uint); -NULLB_DEVICE_ATTR(index, uint); -NULLB_DEVICE_ATTR(blocking, bool); -NULLB_DEVICE_ATTR(use_per_node_hctx, bool); -NULLB_DEVICE_ATTR(memory_backed, bool); -NULLB_DEVICE_ATTR(discard, bool); -NULLB_DEVICE_ATTR(mbps, uint); -NULLB_DEVICE_ATTR(cache_size, ulong); -NULLB_DEVICE_ATTR(zoned, bool); -NULLB_DEVICE_ATTR(zone_size, ulong); -NULLB_DEVICE_ATTR(zone_nr_conv, uint); +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + struct nullb *nullb = dev->nullb; + struct blk_mq_tag_set *set; + + if (!nullb) + return 0; + + set = nullb->tag_set; + blk_mq_update_nr_hw_queues(set, submit_queues); + return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; +} + +NULLB_DEVICE_ATTR(size, ulong, NULL); +NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(home_node, uint, NULL); +NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(irqmode, uint, NULL); +NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +NULLB_DEVICE_ATTR(index, uint, NULL); +NULLB_DEVICE_ATTR(blocking, bool, NULL); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +NULLB_DEVICE_ATTR(discard, bool, NULL); +NULLB_DEVICE_ATTR(mbps, uint, NULL); +NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +NULLB_DEVICE_ATTR(zoned, bool, NULL); +NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -467,7 +493,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -996,6 +1022,16 @@ next: return 0; } +static void nullb_fill_pattern(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off) +{ + void *dst; + + dst = kmap_atomic(page); + memset(dst + off, 0xFF, len); + kunmap_atomic(dst); +} + static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) { size_t temp; @@ -1036,10 +1072,24 @@ static int null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector, bool is_fua) { + struct nullb_device *dev = nullb->dev; + unsigned int valid_len = len; int err = 0; if (!is_write) { - err = copy_from_nullb(nullb, page, off, sector, len); + if (dev->zoned) + valid_len = null_zone_valid_read_len(nullb, + sector, len); + + if (valid_len) { + err = copy_from_nullb(nullb, page, off, + sector, valid_len); + off += valid_len; + len -= valid_len; + } + + if (len) + nullb_fill_pattern(nullb, page, len, off); flush_dcache_page(page); } else { flush_dcache_page(page); @@ -1133,93 +1183,61 @@ static void null_restart_queue_async(struct nullb *nullb) blk_mq_start_stopped_hw_queues(q, true); } -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) +static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; - int err = 0; + blk_status_t sts = BLK_STS_OK; + struct request *rq = cmd->rq; - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - struct request *rq = cmd->rq; - - if (!hrtimer_active(&nullb->bw_timer)) - hrtimer_restart(&nullb->bw_timer); - - if (atomic_long_sub_return(blk_rq_bytes(rq), - &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); - /* race with timer */ - if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); - /* requeue request */ - return BLK_STS_DEV_RESOURCE; - } - } + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); - if (nullb->dev->badblocks.shift != -1) { - int bad_sectors; - sector_t sector, size, first_bad; - bool is_flush = true; - - if (dev->queue_mode == NULL_Q_BIO && - bio_op(cmd->bio) != REQ_OP_FLUSH) { - is_flush = false; - sector = cmd->bio->bi_iter.bi_sector; - size = bio_sectors(cmd->bio); - } - if (dev->queue_mode != NULL_Q_BIO && - req_op(cmd->rq) != REQ_OP_FLUSH) { - is_flush = false; - sector = blk_rq_pos(cmd->rq); - size = blk_rq_sectors(cmd->rq); - } - if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector, - size, &first_bad, &bad_sectors)) { - cmd->error = BLK_STS_IOERR; - goto out; - } + if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + /* requeue request */ + sts = BLK_STS_DEV_RESOURCE; } + return sts; +} - if (dev->memory_backed) { - if (dev->queue_mode == NULL_Q_BIO) { - if (bio_op(cmd->bio) == REQ_OP_FLUSH) - err = null_handle_flush(nullb); - else - err = null_handle_bio(cmd); - } else { - if (req_op(cmd->rq) == REQ_OP_FLUSH) - err = null_handle_flush(nullb); - else - err = null_handle_rq(cmd); - } - } - cmd->error = errno_to_blk_status(err); +static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, + sector_t sector, + sector_t nr_sectors) +{ + struct badblocks *bb = &cmd->nq->dev->badblocks; + sector_t first_bad; + int bad_sectors; - if (!cmd->error && dev->zoned) { - sector_t sector; - unsigned int nr_sectors; - enum req_opf op; + if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_IOERR; - if (dev->queue_mode == NULL_Q_BIO) { - op = bio_op(cmd->bio); - sector = cmd->bio->bi_iter.bi_sector; - nr_sectors = cmd->bio->bi_iter.bi_size >> 9; - } else { - op = req_op(cmd->rq); - sector = blk_rq_pos(cmd->rq); - nr_sectors = blk_rq_sectors(cmd->rq); - } + return BLK_STS_OK; +} - if (op == REQ_OP_WRITE) - null_zone_write(cmd, sector, nr_sectors); - else if (op == REQ_OP_ZONE_RESET) - null_zone_reset(cmd, sector); - } -out: +static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, + enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + int err; + + if (dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + + return errno_to_blk_status(err); +} + +static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +{ /* Complete IO by inline, softirq or timer */ - switch (dev->irqmode) { + switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (dev->queue_mode) { + switch (cmd->nq->dev->queue_mode) { case NULL_Q_MQ: blk_mq_complete_request(cmd->rq); break; @@ -1238,6 +1256,40 @@ out: null_cmd_end_timer(cmd); break; } +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts; + + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + sts = null_handle_throttled(cmd); + if (sts != BLK_STS_OK) + return sts; + } + + if (op == REQ_OP_FLUSH) { + cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + goto out; + } + + if (nullb->dev->badblocks.shift != -1) { + cmd->error = null_handle_badblocks(cmd, sector, nr_sectors); + if (cmd->error != BLK_STS_OK) + goto out; + } + + if (dev->memory_backed) + cmd->error = null_handle_memory_backed(cmd, op); + + if (!cmd->error && dev->zoned) + cmd->error = null_handle_zoned(cmd, op, sector, nr_sectors); + +out: + nullb_complete_cmd(cmd); return BLK_STS_OK; } @@ -1280,6 +1332,8 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) { + sector_t sector = bio->bi_iter.bi_sector; + sector_t nr_sectors = bio_sectors(bio); struct nullb *nullb = q->queuedata; struct nullb_queue *nq = nullb_to_queue(nullb); struct nullb_cmd *cmd; @@ -1287,7 +1341,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) cmd = alloc_cmd(nq, 1); cmd->bio = bio; - null_handle_cmd(cmd); + null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); return BLK_QC_T_NONE; } @@ -1311,7 +1365,7 @@ static bool should_requeue_request(struct request *rq) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { - pr_info("null: rq %p timed out\n", rq); + pr_info("rq %p timed out\n", rq); blk_mq_complete_request(rq); return BLK_EH_DONE; } @@ -1321,6 +1375,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); struct nullb_queue *nq = hctx->driver_data; + sector_t nr_sectors = blk_rq_sectors(bd->rq); + sector_t sector = blk_rq_pos(bd->rq); might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); @@ -1349,7 +1405,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, if (should_timeout_request(bd->rq)) return BLK_STS_OK; - return null_handle_cmd(cmd); + return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); } static const struct blk_mq_ops null_mq_ops = { @@ -1412,20 +1468,9 @@ static void null_config_discard(struct nullb *nullb) blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); } -static int null_open(struct block_device *bdev, fmode_t mode) -{ - return 0; -} - -static void null_release(struct gendisk *disk, fmode_t mode) -{ -} - -static const struct block_device_operations null_fops = { - .owner = THIS_MODULE, - .open = null_open, - .release = null_release, - .report_zones = null_zone_report, +static const struct block_device_operations null_ops = { + .owner = THIS_MODULE, + .report_zones = null_report_zones, }; static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) @@ -1514,29 +1559,35 @@ static int init_driver_queues(struct nullb *nullb) static int null_gendisk_register(struct nullb *nullb) { + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; struct gendisk *disk; - sector_t size; disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); if (!disk) return -ENOMEM; - size = (sector_t)nullb->dev->size * 1024 * 1024ULL; - set_capacity(disk, size >> 9); + set_capacity(disk, size); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; - disk->fops = &null_fops; + disk->fops = &null_ops; disk->private_data = nullb; disk->queue = nullb->q; strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); +#ifdef CONFIG_BLK_DEV_ZONED if (nullb->dev->zoned) { - int ret = blk_revalidate_disk_zones(disk); - - if (ret != 0) - return ret; + if (queue_is_mq(nullb->q)) { + int ret = blk_revalidate_disk_zones(disk); + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(nullb->q, + nullb->dev->zone_size_sects); + nullb->q->nr_zones = blkdev_nr_zones(disk); + } } +#endif add_disk(disk); return 0; @@ -1562,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) return blk_mq_alloc_tag_set(set); } -static void null_validate_conf(struct nullb_device *dev) +static int null_validate_conf(struct nullb_device *dev) { dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); @@ -1589,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev) /* can not stop a queue */ if (dev->queue_mode == NULL_Q_BIO) dev->mbps = 0; + + if (dev->zoned && + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + + return 0; } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION @@ -1621,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev) struct nullb *nullb; int rv; - null_validate_conf(dev); + rv = null_validate_conf(dev); + if (rv) + return rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { @@ -1686,8 +1747,10 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_cleanup_blk_queue; - blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); nullb->q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); + blk_queue_required_elevator_features(nullb->q, + ELEVATOR_F_ZBD_SEQ_WRITE); } nullb->q->queuedata = nullb; @@ -1739,28 +1802,23 @@ static int __init null_init(void) struct nullb_device *dev; if (g_bs > PAGE_SIZE) { - pr_warn("null_blk: invalid block size\n"); - pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); + pr_warn("invalid block size\n"); + pr_warn("defaults block size to %lu\n", PAGE_SIZE); g_bs = PAGE_SIZE; } - if (!is_power_of_2(g_zone_size)) { - pr_err("null_blk: zone_size must be power-of-two\n"); - return -EINVAL; - } - if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { - pr_err("null_blk: invalid home_node value\n"); + pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; } if (g_queue_mode == NULL_Q_RQ) { - pr_err("null_blk: legacy IO path no longer available\n"); + pr_err("legacy IO path no longer available\n"); return -EINVAL; } if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { - pr_warn("null_blk: submit_queues param is set to %u.\n", + pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); g_submit_queues = nr_online_nodes; } @@ -1803,7 +1861,7 @@ static int __init null_init(void) } } - pr_info("null: module loaded\n"); + pr_info("module loaded\n"); return 0; err_dev: diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cb28d93f2bd1..ed34785dd64b 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -17,7 +17,7 @@ int null_zone_init(struct nullb_device *dev) unsigned int i; if (!is_power_of_2(dev->zone_size)) { - pr_err("null_blk: zone_size must be power-of-two\n"); + pr_err("zone_size must be power-of-two\n"); return -EINVAL; } @@ -31,7 +31,7 @@ int null_zone_init(struct nullb_device *dev) if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; - pr_info("null_blk: changed the number of conventional zones to %u", + pr_info("changed the number of conventional zones to %u", dev->zone_nr_conv); } @@ -66,25 +66,56 @@ void null_zone_exit(struct nullb_device *dev) kvfree(dev->zones); } -int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; - unsigned int zno, nrz = 0; - - zno = null_zone_no(dev, sector); - if (zno < dev->nr_zones) { - nrz = min_t(unsigned int, *nr_zones, dev->nr_zones - zno); - memcpy(zones, &dev->zones[zno], nrz * sizeof(struct blk_zone)); + unsigned int first_zone, i; + struct blk_zone zone; + int error; + + first_zone = null_zone_no(dev, sector); + if (first_zone >= dev->nr_zones) + return 0; + + nr_zones = min(nr_zones, dev->nr_zones - first_zone); + for (i = 0; i < nr_zones; i++) { + /* + * Stacked DM target drivers will remap the zone information by + * modifying the zone information passed to the report callback. + * So use a local copy to avoid corruption of the device zone + * array. + */ + memcpy(&zone, &dev->zones[first_zone + i], + sizeof(struct blk_zone)); + error = cb(&zone, i, data); + if (error) + return error; } - *nr_zones = nrz; + return nr_zones; +} - return 0; +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len) +{ + struct nullb_device *dev = nullb->dev; + struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + unsigned int nr_sectors = len >> SECTOR_SHIFT; + + /* Read must be below the write pointer position */ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || + sector + nr_sectors <= zone->wp) + return len; + + if (sector > zone->wp) + return 0; + + return (zone->wp - sector) << SECTOR_SHIFT; } -void null_zone_write(struct nullb_cmd *cmd, sector_t sector, +static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; @@ -95,16 +126,16 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, case BLK_ZONE_COND_FULL: /* Cannot write to a full zone */ cmd->error = BLK_STS_IOERR; - break; + return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: /* Writes must be at the write pointer position */ - if (sector != zone->wp) { - cmd->error = BLK_STS_IOERR; - break; - } + if (sector != zone->wp) + return BLK_STS_IOERR; - if (zone->cond == BLK_ZONE_COND_EMPTY) + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; zone->wp += nr_sectors; @@ -115,22 +146,79 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, break; default: /* Invalid zone condition */ - cmd->error = BLK_STS_IOERR; - break; + return BLK_STS_IOERR; } + return BLK_STS_OK; } -void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector) { struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; + struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + size_t i; + + switch (op) { + case REQ_OP_ZONE_RESET_ALL: + for (i = 0; i < dev->nr_zones; i++) { + if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL) + continue; + zone[i].cond = BLK_ZONE_COND_EMPTY; + zone[i].wp = zone[i].start; + } + break; + case REQ_OP_ZONE_RESET: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { - cmd->error = BLK_STS_IOERR; - return; + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + break; + case REQ_OP_ZONE_OPEN: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + if (zone->cond == BLK_ZONE_COND_FULL) + return BLK_STS_IOERR; + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + if (zone->cond == BLK_ZONE_COND_FULL) + return BLK_STS_IOERR; + + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + break; + case REQ_OP_ZONE_FINISH: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; + break; + default: + return BLK_STS_NOTSUPP; } + return BLK_STS_OK; +} - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; +blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) +{ + switch (op) { + case REQ_OP_WRITE: + return null_zone_write(cmd, sector, nr_sectors); + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return null_zone_mgmt(cmd, op, sector); + default: + return BLK_STS_OK; + } } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 001dbdcbf355..117cfc8cd05a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -275,6 +275,9 @@ static const struct block_device_operations pcd_bdops = { .open = pcd_block_open, .release = pcd_block_release, .ioctl = pcd_block_ioctl, +#ifdef CONFIG_COMPAT + .ioctl = blkdev_compat_ptr_ioctl, +#endif .check_events = pcd_block_check_events, }; @@ -314,8 +317,8 @@ static void pcd_init_units(void) disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops, 1, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(disk->queue)) { - put_disk(disk); disk->queue = NULL; + put_disk(disk); continue; } @@ -723,9 +726,9 @@ static int pcd_detect(void) k = 0; if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ cd = pcd; - if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer, - PI_PCD, verbose, cd->name)) { - if (!pcd_probe(cd, -1, id) && cd->disk) { + if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1, + pcd_buffer, PI_PCD, verbose, cd->name)) { + if (!pcd_probe(cd, -1, id)) { cd->present = 1; k++; } else @@ -736,11 +739,13 @@ static int pcd_detect(void) int *conf = *drives[unit]; if (!conf[D_PRT]) continue; + if (!cd->disk) + continue; if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD], conf[D_UNI], conf[D_PRO], conf[D_DLY], pcd_buffer, PI_PCD, verbose, cd->name)) continue; - if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) { + if (!pcd_probe(cd, conf[D_SLV], id)) { cd->present = 1; k++; } else diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 6f9ad3fc716f..c0967507d085 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -874,6 +874,7 @@ static const struct block_device_operations pd_fops = { .open = pd_open, .release = pd_release, .ioctl = pd_ioctl, + .compat_ioctl = pd_ioctl, .getgeo = pd_getgeo, .check_events = pd_check_events, .revalidate_disk= pd_revalidate diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 1e9c50a7256c..bb09f21ce21a 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -276,6 +276,7 @@ static const struct block_device_operations pf_fops = { .open = pf_open, .release = pf_release, .ioctl = pf_ioctl, + .compat_ioctl = pf_ioctl, .getgeo = pf_getgeo, .check_events = pf_check_events, }; @@ -300,8 +301,8 @@ static void __init pf_init_units(void) disk->queue = blk_mq_init_sq_queue(&pf->tag_set, &pf_mq_ops, 1, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(disk->queue)) { - put_disk(disk); disk->queue = NULL; + put_disk(disk); continue; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 024060165afa..5f970a7d32c0 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2594,7 +2594,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (ret) return ret; if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } @@ -2685,6 +2684,7 @@ static const struct block_device_operations pktcdvd_ops = { .open = pkt_open, .release = pkt_close, .ioctl = pkt_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = pkt_check_events, }; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3327192bb71f..6343402c09e6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -34,7 +34,7 @@ #include <linux/ceph/cls_lock_client.h> #include <linux/ceph/striper.h> #include <linux/ceph/decode.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/bsearch.h> #include <linux/kernel.h> @@ -377,7 +377,6 @@ struct rbd_client_id { struct rbd_mapping { u64 size; - u64 features; }; /* @@ -462,8 +461,9 @@ struct rbd_device { * by rbd_dev->lock */ enum rbd_dev_flags { - RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ + RBD_DEV_FLAG_EXISTS, /* rbd_dev_device_setup() ran */ RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ + RBD_DEV_FLAG_READONLY, /* -o ro or snapshot */ }; static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ @@ -514,6 +514,16 @@ static int minor_to_rbd_dev_id(int minor) return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; } +static bool rbd_is_ro(struct rbd_device *rbd_dev) +{ + return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags); +} + +static bool rbd_is_snap(struct rbd_device *rbd_dev) +{ + return rbd_dev->spec->snap_id != CEPH_NOSNAP; +} + static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) { lockdep_assert_held(&rbd_dev->lock_rwsem); @@ -633,8 +643,6 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, u8 *order, u64 *snap_size); -static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, - u64 *snap_features); static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev); static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result); @@ -695,9 +703,16 @@ static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) if (get_user(ro, (int __user *)arg)) return -EFAULT; - /* Snapshots can't be marked read-write */ - if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) - return -EROFS; + /* + * Both images mapped read-only and snapshots can't be marked + * read-write. + */ + if (!ro) { + if (rbd_is_ro(rbd_dev)) + return -EROFS; + + rbd_assert(!rbd_is_snap(rbd_dev)); + } /* Let blkdev_roset() handle it */ return -ENOTTY; @@ -823,34 +838,29 @@ enum { Opt_queue_depth, Opt_alloc_size, Opt_lock_timeout, - Opt_last_int, /* int args above */ Opt_pool_ns, - Opt_last_string, /* string args above */ Opt_read_only, Opt_read_write, Opt_lock_on_read, Opt_exclusive, Opt_notrim, - Opt_err }; -static match_table_t rbd_opts_tokens = { - {Opt_queue_depth, "queue_depth=%d"}, - {Opt_alloc_size, "alloc_size=%d"}, - {Opt_lock_timeout, "lock_timeout=%d"}, - /* int args above */ - {Opt_pool_ns, "_pool_ns=%s"}, - /* string args above */ - {Opt_read_only, "read_only"}, - {Opt_read_only, "ro"}, /* Alternate spelling */ - {Opt_read_write, "read_write"}, - {Opt_read_write, "rw"}, /* Alternate spelling */ - {Opt_lock_on_read, "lock_on_read"}, - {Opt_exclusive, "exclusive"}, - {Opt_notrim, "notrim"}, - {Opt_err, NULL} +static const struct fs_parameter_spec rbd_parameters[] = { + fsparam_u32 ("alloc_size", Opt_alloc_size), + fsparam_flag ("exclusive", Opt_exclusive), + fsparam_flag ("lock_on_read", Opt_lock_on_read), + fsparam_u32 ("lock_timeout", Opt_lock_timeout), + fsparam_flag ("notrim", Opt_notrim), + fsparam_string ("_pool_ns", Opt_pool_ns), + fsparam_u32 ("queue_depth", Opt_queue_depth), + fsparam_flag ("read_only", Opt_read_only), + fsparam_flag ("read_write", Opt_read_write), + fsparam_flag ("ro", Opt_read_only), + fsparam_flag ("rw", Opt_read_write), + {} }; struct rbd_options { @@ -871,87 +881,12 @@ struct rbd_options { #define RBD_EXCLUSIVE_DEFAULT false #define RBD_TRIM_DEFAULT true -struct parse_rbd_opts_ctx { +struct rbd_parse_opts_ctx { struct rbd_spec *spec; + struct ceph_options *copts; struct rbd_options *opts; }; -static int parse_rbd_opts_token(char *c, void *private) -{ - struct parse_rbd_opts_ctx *pctx = private; - substring_t argstr[MAX_OPT_ARGS]; - int token, intval, ret; - - token = match_token(c, rbd_opts_tokens, argstr); - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad option arg (not int) at '%s'\n", c); - return ret; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, argstr[0].from); - } else { - dout("got token %d\n", token); - } - - switch (token) { - case Opt_queue_depth: - if (intval < 1) { - pr_err("queue_depth out of range\n"); - return -EINVAL; - } - pctx->opts->queue_depth = intval; - break; - case Opt_alloc_size: - if (intval < SECTOR_SIZE) { - pr_err("alloc_size out of range\n"); - return -EINVAL; - } - if (!is_power_of_2(intval)) { - pr_err("alloc_size must be a power of 2\n"); - return -EINVAL; - } - pctx->opts->alloc_size = intval; - break; - case Opt_lock_timeout: - /* 0 is "wait forever" (i.e. infinite timeout) */ - if (intval < 0 || intval > INT_MAX / 1000) { - pr_err("lock_timeout out of range\n"); - return -EINVAL; - } - pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); - break; - case Opt_pool_ns: - kfree(pctx->spec->pool_ns); - pctx->spec->pool_ns = match_strdup(argstr); - if (!pctx->spec->pool_ns) - return -ENOMEM; - break; - case Opt_read_only: - pctx->opts->read_only = true; - break; - case Opt_read_write: - pctx->opts->read_only = false; - break; - case Opt_lock_on_read: - pctx->opts->lock_on_read = true; - break; - case Opt_exclusive: - pctx->opts->exclusive = true; - break; - case Opt_notrim: - pctx->opts->trim = false; - break; - default: - /* libceph prints "bad option" msg */ - return -EINVAL; - } - - return 0; -} - static char* obj_op_name(enum obj_operation_type op_type) { switch (op_type) { @@ -1302,51 +1237,23 @@ static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, return 0; } -static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, - u64 *snap_features) -{ - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - if (snap_id == CEPH_NOSNAP) { - *snap_features = rbd_dev->header.features; - } else if (rbd_dev->image_format == 1) { - *snap_features = 0; /* No features for format 1 */ - } else { - u64 features = 0; - int ret; - - ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); - if (ret) - return ret; - - *snap_features = features; - } - return 0; -} - static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { u64 snap_id = rbd_dev->spec->snap_id; u64 size = 0; - u64 features = 0; int ret; ret = rbd_snap_size(rbd_dev, snap_id, &size); if (ret) return ret; - ret = rbd_snap_features(rbd_dev, snap_id, &features); - if (ret) - return ret; rbd_dev->mapping.size = size; - rbd_dev->mapping.features = features; - return 0; } static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) { rbd_dev->mapping.size = 0; - rbd_dev->mapping.features = 0; } static void zero_bvec(struct bio_vec *bv) @@ -1754,8 +1661,6 @@ static struct rbd_img_request *rbd_img_request_create( mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), img_request); return img_request; } @@ -1834,6 +1739,17 @@ static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) static bool use_object_map(struct rbd_device *rbd_dev) { + /* + * An image mapped read-only can't use the object map -- it isn't + * loaded because the header lock isn't acquired. Someone else can + * write to the image and update the object map behind our back. + * + * A snapshot can't be written to, so using the object map is always + * safe. + */ + if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev)) + return false; + return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) && !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)); } @@ -2089,7 +2005,7 @@ static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_data *osd_data; u64 objno; - u8 state, new_state, current_state; + u8 state, new_state, uninitialized_var(current_state); bool has_current_state; void *p; @@ -2741,7 +2657,7 @@ static int rbd_img_fill_nodata(struct rbd_img_request *img_req, u64 off, u64 len) { struct ceph_file_extent ex = { off, len }; - union rbd_img_fill_iter dummy; + union rbd_img_fill_iter dummy = {}; struct rbd_img_fill_ctx fctx = { .pos_type = OBJ_REQUEST_NODATA, .pos = &dummy, @@ -2944,6 +2860,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) __set_bit(IMG_REQ_CHILD, &child_img_req->flags); child_img_req->obj_request = obj_req; + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, + obj_req); + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: @@ -3038,6 +2957,17 @@ again: } return true; case RBD_OBJ_READ_PARENT: + /* + * The parent image is read only up to the overlap -- zero-fill + * from the overlap to the end of the request. + */ + if (!*result) { + u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req); + + if (obj_overlap < obj_req->ex.oe_len) + rbd_obj_zero_range(obj_req, obj_overlap, + obj_req->ex.oe_len - obj_overlap); + } return true; default: BUG(); @@ -3543,7 +3473,7 @@ static bool need_exclusive_lock(struct rbd_img_request *img_req) if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) return false; - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + if (rbd_is_ro(rbd_dev)) return false; rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); @@ -4218,7 +4148,7 @@ again: * lock owner acked, but resend if we don't see them * release the lock */ - dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, + dout("%s rbd_dev %p requeuing lock_dwork\n", __func__, rbd_dev); mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); @@ -4814,24 +4744,14 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } - if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) { - rbd_warn(rbd_dev, "%s on read-only snapshot", - obj_op_name(op_type)); - result = -EIO; - goto err; - } - - /* - * Quit early if the mapped snapshot no longer exists. It's - * still possible the snapshot will have disappeared by the - * time our request arrives at the osd, but there's no sense in - * sending it if we already know. - */ - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { - dout("request for non-existent snapshot"); - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - result = -ENXIO; - goto err_rq; + if (op_type != OBJ_OP_READ) { + if (rbd_is_ro(rbd_dev)) { + rbd_warn(rbd_dev, "%s on read-only mapping", + obj_op_name(op_type)); + result = -EIO; + goto err; + } + rbd_assert(!rbd_is_snap(rbd_dev)); } if (offset && length > U64_MAX - offset + 1) { @@ -4866,6 +4786,9 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, + img_request, obj_op_name(op_type), offset, length); + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else @@ -5010,25 +4933,6 @@ out: return ret; } -/* - * Clear the rbd device's EXISTS flag if the snapshot it's mapped to - * has disappeared from the (just updated) snapshot context. - */ -static void rbd_exists_validate(struct rbd_device *rbd_dev) -{ - u64 snap_id; - - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) - return; - - snap_id = rbd_dev->spec->snap_id; - if (snap_id == CEPH_NOSNAP) - return; - - if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) - clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); -} - static void rbd_dev_update_size(struct rbd_device *rbd_dev) { sector_t size; @@ -5069,12 +4973,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) goto out; } - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { - rbd_dev->mapping.size = rbd_dev->header.image_size; - } else { - /* validate mapped snapshot's EXISTS flag */ - rbd_exists_validate(rbd_dev); - } + rbd_assert(!rbd_is_snap(rbd_dev)); + rbd_dev->mapping.size = rbd_dev->header.image_size; out: up_write(&rbd_dev->header_rwsem); @@ -5196,17 +5096,12 @@ static ssize_t rbd_size_show(struct device *dev, (unsigned long long)rbd_dev->mapping.size); } -/* - * Note this shows the features for whatever's mapped, which is not - * necessarily the base image. - */ static ssize_t rbd_features_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "0x%016llx\n", - (unsigned long long)rbd_dev->mapping.features); + return sprintf(buf, "0x%016llx\n", rbd_dev->header.features); } static ssize_t rbd_major_show(struct device *dev, @@ -5658,17 +5553,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) { + size_t size; void *reply_buf; int ret; void *p; - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + /* Response will be an encoded string, which includes a length */ + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; + reply_buf = kzalloc(size, GFP_KERNEL); if (!reply_buf) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_object_prefix", - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -5691,9 +5589,12 @@ out: } static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, - u64 *snap_features) + bool read_only, u64 *snap_features) { - __le64 snapid = cpu_to_le64(snap_id); + struct { + __le64 snap_id; + u8 read_only; + } features_in; struct { __le64 features; __le64 incompat; @@ -5701,9 +5602,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 unsup; int ret; + features_in.snap_id = cpu_to_le64(snap_id); + features_in.read_only = read_only; + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_features", - &snapid, sizeof(snapid), + &features_in, sizeof(features_in), &features_buf, sizeof(features_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) @@ -5731,7 +5635,8 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, static int rbd_dev_v2_features(struct rbd_device *rbd_dev) { return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, - &rbd_dev->header.features); + rbd_is_ro(rbd_dev), + &rbd_dev->header.features); } /* @@ -6438,6 +6343,118 @@ static inline char *dup_token(const char **buf, size_t *lenp) return dup; } +static int rbd_parse_param(struct fs_parameter *param, + struct rbd_parse_opts_ctx *pctx) +{ + struct rbd_options *opt = pctx->opts; + struct fs_parse_result result; + struct p_log log = {.prefix = "rbd"}; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, NULL); + if (ret != -ENOPARAM) + return ret; + + token = __fs_parse(&log, rbd_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) { + if (token == -ENOPARAM) + return inval_plog(&log, "Unknown parameter '%s'", + param->key); + return token; + } + + switch (token) { + case Opt_queue_depth: + if (result.uint_32 < 1) + goto out_of_range; + opt->queue_depth = result.uint_32; + break; + case Opt_alloc_size: + if (result.uint_32 < SECTOR_SIZE) + goto out_of_range; + if (!is_power_of_2(result.uint_32)) + return inval_plog(&log, "alloc_size must be a power of 2"); + opt->alloc_size = result.uint_32; + break; + case Opt_lock_timeout: + /* 0 is "wait forever" (i.e. infinite timeout) */ + if (result.uint_32 > INT_MAX / 1000) + goto out_of_range; + opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000); + break; + case Opt_pool_ns: + kfree(pctx->spec->pool_ns); + pctx->spec->pool_ns = param->string; + param->string = NULL; + break; + case Opt_read_only: + opt->read_only = true; + break; + case Opt_read_write: + opt->read_only = false; + break; + case Opt_lock_on_read: + opt->lock_on_read = true; + break; + case Opt_exclusive: + opt->exclusive = true; + break; + case Opt_notrim: + opt->trim = false; + break; + default: + BUG(); + } + + return 0; + +out_of_range: + return inval_plog(&log, "%s out of range", param->key); +} + +/* + * This duplicates most of generic_parse_monolithic(), untying it from + * fs_context and skipping standard superblock and security options. + */ +static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx) +{ + char *key; + int ret = 0; + + dout("%s '%s'\n", __func__, options); + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + struct fs_parameter param = { + .key = key, + .type = fs_value_is_flag, + }; + char *value = strchr(key, '='); + size_t v_len = 0; + + if (value) { + if (value == key) + continue; + *value++ = 0; + v_len = strlen(value); + param.string = kmemdup_nul(value, v_len, + GFP_KERNEL); + if (!param.string) + return -ENOMEM; + param.type = fs_value_is_string; + } + param.size = v_len; + + ret = rbd_parse_param(¶m, pctx); + kfree(param.string); + if (ret) + break; + } + } + + return ret; +} + /* * Parse the options provided for an "rbd add" (i.e., rbd image * mapping) request. These arrive via a write to /sys/bus/rbd/add, @@ -6489,8 +6506,7 @@ static int rbd_add_parse_args(const char *buf, const char *mon_addrs; char *snap_name; size_t mon_addrs_size; - struct parse_rbd_opts_ctx pctx = { 0 }; - struct ceph_options *copts; + struct rbd_parse_opts_ctx pctx = { 0 }; int ret; /* The first four tokens are required */ @@ -6501,7 +6517,7 @@ static int rbd_add_parse_args(const char *buf, return -EINVAL; } mon_addrs = buf; - mon_addrs_size = len + 1; + mon_addrs_size = len; buf += len; ret = -EINVAL; @@ -6551,6 +6567,10 @@ static int rbd_add_parse_args(const char *buf, *(snap_name + len) = '\0'; pctx.spec->snap_name = snap_name; + pctx.copts = ceph_alloc_options(); + if (!pctx.copts) + goto out_mem; + /* Initialize all rbd options to the defaults */ pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); @@ -6565,27 +6585,27 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; pctx.opts->trim = RBD_TRIM_DEFAULT; - copts = ceph_parse_options(options, mon_addrs, - mon_addrs + mon_addrs_size - 1, - parse_rbd_opts_token, &pctx); - if (IS_ERR(copts)) { - ret = PTR_ERR(copts); + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); + if (ret) + goto out_err; + + ret = rbd_parse_options(options, &pctx); + if (ret) goto out_err; - } - kfree(options); - *ceph_opts = copts; + *ceph_opts = pctx.copts; *opts = pctx.opts; *rbd_spec = pctx.spec; - + kfree(options); return 0; + out_mem: ret = -ENOMEM; out_err: kfree(pctx.opts); + ceph_destroy_options(pctx.copts); rbd_spec_put(pctx.spec); kfree(options); - return ret; } @@ -6614,17 +6634,20 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) return -EINVAL; } - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + if (rbd_is_ro(rbd_dev)) return 0; rbd_assert(!rbd_is_lock_owner(rbd_dev)); queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait, ceph_timeout_jiffies(rbd_dev->opts->lock_timeout)); - if (ret > 0) + if (ret > 0) { ret = rbd_dev->acquire_err; - else if (!ret) - ret = -ETIMEDOUT; + } else { + cancel_delayed_work_sync(&rbd_dev->lock_dwork); + if (!ret) + ret = -ETIMEDOUT; + } if (ret) { rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); @@ -6685,7 +6708,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ - size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; response = kzalloc(size, GFP_NOIO); if (!response) { @@ -6697,7 +6719,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + response, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); @@ -6818,6 +6840,8 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) __rbd_get_client(rbd_dev->rbd_client); rbd_spec_get(rbd_dev->parent_spec); + __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags); + ret = rbd_dev_image_probe(parent, depth); if (ret < 0) goto out_err; @@ -6869,7 +6893,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) goto err_out_blkdev; set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); - set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); + set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev)); ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); if (ret) @@ -6907,6 +6931,24 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) return ret; } +static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) +{ + if (!is_snap) { + pr_info("image %s/%s%s%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->pool_ns ?: "", + rbd_dev->spec->pool_ns ? "/" : "", + rbd_dev->spec->image_name); + } else { + pr_info("snap %s/%s%s%s@%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->pool_ns ?: "", + rbd_dev->spec->pool_ns ? "/" : "", + rbd_dev->spec->image_name, + rbd_dev->spec->snap_name); + } +} + static void rbd_dev_image_release(struct rbd_device *rbd_dev) { rbd_dev_unprobe(rbd_dev); @@ -6925,6 +6967,7 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) */ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) { + bool need_watch = !rbd_is_ro(rbd_dev); int ret; /* @@ -6941,22 +6984,21 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) if (ret) goto err_out_format; - if (!depth) { + if (need_watch) { ret = rbd_register_watch(rbd_dev); if (ret) { if (ret == -ENOENT) - pr_info("image %s/%s%s%s does not exist\n", - rbd_dev->spec->pool_name, - rbd_dev->spec->pool_ns ?: "", - rbd_dev->spec->pool_ns ? "/" : "", - rbd_dev->spec->image_name); + rbd_print_dne(rbd_dev, false); goto err_out_format; } } ret = rbd_dev_header_info(rbd_dev); - if (ret) + if (ret) { + if (ret == -ENOENT && !need_watch) + rbd_print_dne(rbd_dev, false); goto err_out_watch; + } /* * If this image is the one being mapped, we have pool name and @@ -6970,12 +7012,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) ret = rbd_spec_fill_names(rbd_dev); if (ret) { if (ret == -ENOENT) - pr_info("snap %s/%s%s%s@%s does not exist\n", - rbd_dev->spec->pool_name, - rbd_dev->spec->pool_ns ?: "", - rbd_dev->spec->pool_ns ? "/" : "", - rbd_dev->spec->image_name, - rbd_dev->spec->snap_name); + rbd_print_dne(rbd_dev, true); goto err_out_probe; } @@ -6983,7 +7020,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) if (ret) goto err_out_probe; - if (rbd_dev->spec->snap_id != CEPH_NOSNAP && + if (rbd_is_snap(rbd_dev) && (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) { ret = rbd_object_map_load(rbd_dev); if (ret) @@ -7007,7 +7044,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - if (!depth) + if (need_watch) rbd_unregister_watch(rbd_dev); err_out_format: rbd_dev->image_format = 0; @@ -7059,6 +7096,11 @@ static ssize_t do_rbd_add(struct bus_type *bus, spec = NULL; /* rbd_dev now owns this */ rbd_opts = NULL; /* rbd_dev now owns this */ + /* if we are mapping a snapshot it will be a read-only mapping */ + if (rbd_dev->opts->read_only || + strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME)) + __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags); + rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); if (!rbd_dev->config_info) { rc = -ENOMEM; @@ -7072,10 +7114,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_rbd_dev; } - /* If we are mapping a snapshot it must be marked read-only */ - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - rbd_dev->opts->read_only = true; - if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { rbd_warn(rbd_dev, "alloc_size adjusted to %u", rbd_dev->layout.object_size); @@ -7096,7 +7134,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, if (rc) goto err_out_image_lock; - add_disk(rbd_dev->disk); + device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); /* see rbd_init_disk() */ blk_put_queue(rbd_dev->disk->queue); diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 76b73ddf8fd7..10f6368117d8 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -1000,8 +1000,10 @@ static void rsxx_pci_remove(struct pci_dev *dev) cancel_work_sync(&card->event_work); + destroy_workqueue(card->event_wq); rsxx_destroy_dev(card); rsxx_dma_destroy(card); + destroy_workqueue(card->creg_ctrl.creg_wq); spin_lock_irqsave(&card->irq_lock, flags); rsxx_disable_ier_and_isr(card, CR_INTR_ALL); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 6b2fd630de85..39aeebc6837d 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -171,6 +171,7 @@ static const struct block_device_operations vdc_fops = { .owner = THIS_MODULE, .getgeo = vdc_getgeo, .ioctl = vdc_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, }; static void vdc_blk_queue_start(struct vdc_port *port) @@ -634,7 +635,7 @@ static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) case VD_OP_GET_EFI: case VD_OP_SET_EFI: return -EOPNOTSUPP; - }; + } map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 1f3f9e0f02a8..4eaf97d7a170 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -827,7 +827,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) goto failed_req_csr; } - card->csr_remap = ioremap_nocache(csr_base, csr_len); + card->csr_remap = ioremap(csr_base, csr_len); if (!card->csr_remap) { dev_printk(KERN_ERR, &card->dev->dev, "Unable to remap memory region\n"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7ffd719d89de..54158766334b 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -11,7 +11,6 @@ #include <linux/virtio_blk.h> #include <linux/scatterlist.h> #include <linux/string_helpers.h> -#include <scsi/scsi_cmnd.h> #include <linux/idr.h> #include <linux/blk-mq.h> #include <linux/blk-mq-virtio.h> @@ -56,11 +55,6 @@ struct virtio_blk { }; struct virtblk_req { -#ifdef CONFIG_VIRTIO_BLK_SCSI - struct scsi_request sreq; /* for SCSI passthrough, must be first */ - u8 sense[SCSI_SENSE_BUFFERSIZE]; - struct virtio_scsi_inhdr in_hdr; -#endif struct virtio_blk_outhdr out_hdr; u8 status; struct scatterlist sg[]; @@ -78,80 +72,6 @@ static inline blk_status_t virtblk_result(struct virtblk_req *vbr) } } -/* - * If this is a packet command we need a couple of additional headers. Behind - * the normal outhdr we put a segment with the scsi command block, and before - * the normal inhdr we put the sense data and the inhdr with additional status - * information. - */ -#ifdef CONFIG_VIRTIO_BLK_SCSI -static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr, - struct scatterlist *data_sg, bool have_data) -{ - struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; - unsigned int num_out = 0, num_in = 0; - - sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); - sgs[num_out++] = &hdr; - sg_init_one(&cmd, vbr->sreq.cmd, vbr->sreq.cmd_len); - sgs[num_out++] = &cmd; - - if (have_data) { - if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) - sgs[num_out++] = data_sg; - else - sgs[num_out + num_in++] = data_sg; - } - - sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE); - sgs[num_out + num_in++] = &sense; - sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); - sgs[num_out + num_in++] = &inhdr; - sg_init_one(&status, &vbr->status, sizeof(vbr->status)); - sgs[num_out + num_in++] = &status; - - return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); -} - -static inline void virtblk_scsi_request_done(struct request *req) -{ - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - struct virtio_blk *vblk = req->q->queuedata; - struct scsi_request *sreq = &vbr->sreq; - - sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); - sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); - sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); -} - -static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long data) -{ - struct gendisk *disk = bdev->bd_disk; - struct virtio_blk *vblk = disk->private_data; - - /* - * Only allow the generic SCSI ioctls if the host can support it. - */ - if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) - return -ENOTTY; - - return scsi_cmd_blk_ioctl(bdev, mode, cmd, - (void __user *)data); -} -#else -static inline int virtblk_add_req_scsi(struct virtqueue *vq, - struct virtblk_req *vbr, struct scatterlist *data_sg, - bool have_data) -{ - return -EIO; -} -static inline void virtblk_scsi_request_done(struct request *req) -{ -} -#define virtblk_ioctl NULL -#endif /* CONFIG_VIRTIO_BLK_SCSI */ - static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, bool have_data) { @@ -216,13 +136,6 @@ static inline void virtblk_request_done(struct request *req) req->special_vec.bv_offset); } - switch (req_op(req)) { - case REQ_OP_SCSI_IN: - case REQ_OP_SCSI_OUT: - virtblk_scsi_request_done(req); - break; - } - blk_mq_end_request(req, virtblk_result(vbr)); } @@ -299,10 +212,6 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, type = VIRTIO_BLK_T_WRITE_ZEROES; unmap = !(req->cmd_flags & REQ_NOUNMAP); break; - case REQ_OP_SCSI_IN: - case REQ_OP_SCSI_OUT: - type = VIRTIO_BLK_T_SCSI_CMD; - break; case REQ_OP_DRV_IN: type = VIRTIO_BLK_T_GET_ID; break; @@ -333,10 +242,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - if (blk_rq_is_scsi(req)) - err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num); - else - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); if (err) { virtqueue_kick(vblk->vqs[qid].vq); blk_mq_stop_hw_queue(hctx); @@ -404,7 +310,6 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) } static const struct block_device_operations virtblk_fops = { - .ioctl = virtblk_ioctl, .owner = THIS_MODULE, .getgeo = virtblk_getgeo, }; @@ -683,9 +588,6 @@ static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, struct virtio_blk *vblk = set->driver_data; struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); -#ifdef CONFIG_VIRTIO_BLK_SCSI - vbr->sreq.sense = vbr->sense; -#endif sg_init_table(vbr->sg, vblk->sg_elems); return 0; } @@ -698,23 +600,11 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) vblk->vdev, 0); } -#ifdef CONFIG_VIRTIO_BLK_SCSI -static void virtblk_initialize_rq(struct request *req) -{ - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - - scsi_req_init(&vbr->sreq); -} -#endif - static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, .init_request = virtblk_init_request, -#ifdef CONFIG_VIRTIO_BLK_SCSI - .initialize_rq_fn = virtblk_initialize_rq, -#endif .map_queues = virtblk_map_queues, }; @@ -991,9 +881,6 @@ static const struct virtio_device_id id_table[] = { static unsigned int features_legacy[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, -#ifdef CONFIG_VIRTIO_BLK_SCSI - VIRTIO_BLK_F_SCSI, -#endif VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index fd1e19f1a49f..c2f71265af4b 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -62,8 +62,8 @@ * IO workloads. */ -static int xen_blkif_max_buffer_pages = 1024; -module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); +static int max_buffer_pages = 1024; +module_param_named(max_buffer_pages, max_buffer_pages, int, 0644); MODULE_PARM_DESC(max_buffer_pages, "Maximum number of free pages to keep in each block backend buffer"); @@ -78,8 +78,8 @@ MODULE_PARM_DESC(max_buffer_pages, * algorithm. */ -static int xen_blkif_max_pgrants = 1056; -module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); +static int max_pgrants = 1056; +module_param_named(max_persistent_grants, max_pgrants, int, 0644); MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); @@ -88,8 +88,8 @@ MODULE_PARM_DESC(max_persistent_grants, * use. The time is in seconds, 0 means indefinitely long. */ -static unsigned int xen_blkif_pgrant_timeout = 60; -module_param_named(persistent_grant_unused_seconds, xen_blkif_pgrant_timeout, +static unsigned int pgrant_timeout = 60; +module_param_named(persistent_grant_unused_seconds, pgrant_timeout, uint, 0644); MODULE_PARM_DESC(persistent_grant_unused_seconds, "Time in seconds an unused persistent grant is allowed to " @@ -137,9 +137,8 @@ module_param(log_stats, int, 0644); static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt) { - return xen_blkif_pgrant_timeout && - (jiffies - persistent_gnt->last_used >= - HZ * xen_blkif_pgrant_timeout); + return pgrant_timeout && (jiffies - persistent_gnt->last_used >= + HZ * pgrant_timeout); } static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) @@ -234,7 +233,7 @@ static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *this; struct xen_blkif *blkif = ring->blkif; - if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; @@ -397,14 +396,13 @@ static void purge_persistent_gnt(struct xen_blkif_ring *ring) goto out; } - if (ring->persistent_gnt_c < xen_blkif_max_pgrants || - (ring->persistent_gnt_c == xen_blkif_max_pgrants && + if (ring->persistent_gnt_c < max_pgrants || + (ring->persistent_gnt_c == max_pgrants && !ring->blkif->vbd.overflow_max_grants)) { num_clean = 0; } else { - num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + - num_clean; + num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN; + num_clean = ring->persistent_gnt_c - max_pgrants + num_clean; num_clean = min(ring->persistent_gnt_c, num_clean); pr_debug("Going to purge at least %u persistent grants\n", num_clean); @@ -599,8 +597,7 @@ static void print_stats(struct xen_blkif_ring *ring) current->comm, ring->st_oo_req, ring->st_rd_req, ring->st_wr_req, ring->st_f_req, ring->st_ds_req, - ring->persistent_gnt_c, - xen_blkif_max_pgrants); + ring->persistent_gnt_c, max_pgrants); ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); ring->st_rd_req = 0; ring->st_wr_req = 0; @@ -656,8 +653,11 @@ purge_gnt_list: ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } - /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); + /* Shrink the free pages pool if it is too large. */ + if (time_before(jiffies, blkif->buffer_squeeze_end)) + shrink_free_pagepool(ring, 0); + else + shrink_free_pagepool(ring, max_buffer_pages); if (log_stats && time_after(jiffies, ring->st_print)) print_stats(ring); @@ -884,7 +884,7 @@ again: continue; } if (use_persistent_gnts && - ring->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. @@ -911,7 +911,7 @@ again: pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", persistent_gnt->gnt, ring->persistent_gnt_c, - xen_blkif_max_pgrants); + max_pgrants); goto next; } if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { @@ -936,6 +936,8 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); put_free_pages(ring, pages_to_gnt, segs_to_map); + for (i = last_map; i < num; i++) + pages[i]->handle = BLKBACK_INVALID_HANDLE; return -ENOMEM; } @@ -1504,5 +1506,13 @@ static int __init xen_blkif_init(void) module_init(xen_blkif_init); +static void __exit xen_blkif_fini(void) +{ + xen_blkif_xenbus_fini(); + xen_blkif_interface_fini(); +} + +module_exit(xen_blkif_fini); + MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("xen-backend:vbd"); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 1d3002d773f7..a3eeccf3ac5f 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -319,6 +319,7 @@ struct xen_blkif { /* All rings for this device. */ struct xen_blkif_ring *rings; unsigned int nr_rings; + unsigned long buffer_squeeze_end; }; struct seg_buf { @@ -375,9 +376,12 @@ struct phys_req { struct block_device *bdev; blkif_sector_t sector_number; }; + int xen_blkif_interface_init(void); +void xen_blkif_interface_fini(void); int xen_blkif_xenbus_init(void); +void xen_blkif_xenbus_fini(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index b90dbcd99c03..42944d41aea0 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -171,6 +171,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->domid = domid; atomic_set(&blkif->refcnt, 1); init_completion(&blkif->drain_complete); + + /* + * Because freeing back to the cache may be deferred, it is not + * safe to unload the module (and hence destroy the cache) until + * this has completed. To prevent premature unloading, take an + * extra module reference here and release only when the object + * has been freed back to the cache. + */ + __module_get(THIS_MODULE); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); return blkif; @@ -181,6 +190,9 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, { int err; struct xen_blkif *blkif = ring->blkif; + const struct blkif_common_sring *sring_common; + RING_IDX rsp_prod, req_prod; + unsigned int size; /* Already connected through? */ if (ring->irq) @@ -191,46 +203,62 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, if (err < 0) return err; + sring_common = (struct blkif_common_sring *)ring->blk_ring; + rsp_prod = READ_ONCE(sring_common->rsp_prod); + req_prod = READ_ONCE(sring_common->req_prod); + switch (blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: { - struct blkif_sring *sring; - sring = (struct blkif_sring *)ring->blk_ring; - BACK_RING_INIT(&ring->blk_rings.native, sring, - XEN_PAGE_SIZE * nr_grefs); + struct blkif_sring *sring_native = + (struct blkif_sring *)ring->blk_ring; + + BACK_RING_ATTACH(&ring->blk_rings.native, sring_native, + rsp_prod, XEN_PAGE_SIZE * nr_grefs); + size = __RING_SIZE(sring_native, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { - struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; - BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, - XEN_PAGE_SIZE * nr_grefs); + struct blkif_x86_32_sring *sring_x86_32 = + (struct blkif_x86_32_sring *)ring->blk_ring; + + BACK_RING_ATTACH(&ring->blk_rings.x86_32, sring_x86_32, + rsp_prod, XEN_PAGE_SIZE * nr_grefs); + size = __RING_SIZE(sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { - struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; - BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, - XEN_PAGE_SIZE * nr_grefs); + struct blkif_x86_64_sring *sring_x86_64 = + (struct blkif_x86_64_sring *)ring->blk_ring; + + BACK_RING_ATTACH(&ring->blk_rings.x86_64, sring_x86_64, + rsp_prod, XEN_PAGE_SIZE * nr_grefs); + size = __RING_SIZE(sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } default: BUG(); } + err = -EIO; + if (req_prod - rsp_prod > size) + goto fail; + err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, "blkif-backend", ring); - if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); - ring->blk_rings.common.sring = NULL; - return err; - } + if (err < 0) + goto fail; ring->irq = err; return 0; + +fail: + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + return err; } static int xen_blkif_disconnect(struct xen_blkif *blkif) @@ -320,6 +348,7 @@ static void xen_blkif_free(struct xen_blkif *blkif) /* Make sure everything is drained before shutting down */ kmem_cache_free(xen_blkif_cachep, blkif); + module_put(THIS_MODULE); } int __init xen_blkif_interface_init(void) @@ -333,6 +362,12 @@ int __init xen_blkif_interface_init(void) return 0; } +void xen_blkif_interface_fini(void) +{ + kmem_cache_destroy(xen_blkif_cachep); + xen_blkif_cachep = NULL; +} + /* * sysfs interface for VBD I/O requests */ @@ -432,7 +467,6 @@ static void xenvbd_sysfs_delif(struct xenbus_device *dev) device_remove_file(&dev->dev, &dev_attr_physical_device); } - static void xen_vbd_free(struct xen_vbd *vbd) { if (vbd->bdev) @@ -489,6 +523,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, handle, blkif->domid); return 0; } + static int xen_blkbk_remove(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); @@ -572,6 +607,7 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info if (err) dev_warn(&dev->dev, "writing feature-discard (%d)", err); } + int xen_blkbk_barrier(struct xenbus_transaction xbt, struct backend_info *be, int state) { @@ -656,7 +692,6 @@ fail: return err; } - /* * Callback received when the hotplug scripts have placed the physical-device * node. Read it and the mode node, and create a vbd. If the frontend is @@ -748,7 +783,6 @@ static void backend_changed(struct xenbus_watch *watch, } } - /* * Callback received when the frontend's state changes. */ @@ -823,9 +857,27 @@ static void frontend_changed(struct xenbus_device *dev, } } +/* Once a memory pressure is detected, squeeze free page pools for a while. */ +static unsigned int buffer_squeeze_duration_ms = 10; +module_param_named(buffer_squeeze_duration_ms, + buffer_squeeze_duration_ms, int, 0644); +MODULE_PARM_DESC(buffer_squeeze_duration_ms, +"Duration in ms to squeeze pages buffer when a memory pressure is detected"); -/* ** Connection ** */ +/* + * Callback received when the memory pressure is detected. + */ +static void reclaim_memory(struct xenbus_device *dev) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + if (!be) + return; + be->blkif->buffer_squeeze_end = jiffies + + msecs_to_jiffies(buffer_squeeze_duration_ms); +} + +/* ** Connection ** */ /* * Write the physical details regarding the block device to the store, and @@ -1115,10 +1167,17 @@ static struct xenbus_driver xen_blkbk_driver = { .ids = xen_blkbk_ids, .probe = xen_blkbk_probe, .remove = xen_blkbk_remove, - .otherend_changed = frontend_changed + .otherend_changed = frontend_changed, + .allow_rebind = true, + .reclaim_memory = reclaim_memory, }; int xen_blkif_xenbus_init(void) { return xenbus_register_backend(&xen_blkbk_driver); } + +void xen_blkif_xenbus_fini(void) +{ + xenbus_unregister_driver(&xen_blkbk_driver); +} diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a74d03913822..e2ad6bba2281 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -151,9 +151,6 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the #define BLK_RING_SIZE(info) \ __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) -#define BLK_MAX_RING_SIZE \ - __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) - /* * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 * characters are enough. Define to 20 to keep consistent with backend. @@ -177,12 +174,12 @@ struct blkfront_ring_info { unsigned int evtchn, irq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head indirect_pages; struct list_head grants; unsigned int persistent_gnts_c; unsigned long shadow_free; struct blkfront_info *dev_info; + struct blk_shadow shadow[]; }; /* @@ -1113,8 +1110,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (!VDEV_IS_EXTENDED(info->vdevice)) { err = xen_translate_vdev(info->vdevice, &minor, &offset); if (err) - return err; - nr_parts = PARTS_PER_DISK; + return err; + nr_parts = PARTS_PER_DISK; } else { minor = BLKIF_MINOR_EXT(info->vdevice); nr_parts = PARTS_PER_EXT_DISK; @@ -1915,7 +1912,8 @@ static int negotiate_mq(struct blkfront_info *info) info->nr_rings = 1; info->rinfo = kvcalloc(info->nr_rings, - sizeof(struct blkfront_ring_info), + struct_size(info->rinfo, shadow, + BLK_RING_SIZE(info)), GFP_KERNEL); if (!info->rinfo) { xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure"); @@ -2632,6 +2630,7 @@ static const struct block_device_operations xlvbd_block_fops = .release = blkif_release, .getgeo = blkif_getgeo, .ioctl = blkif_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, }; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d58a359a6622..1bdb5793842b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -207,14 +207,17 @@ static inline void zram_fill_page(void *ptr, unsigned long len, static bool page_same_filled(void *ptr, unsigned long *element) { - unsigned int pos; unsigned long *page; unsigned long val; + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; page = (unsigned long *)ptr; val = page[0]; - for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[last_pos]) + return false; + + for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) return false; } @@ -413,13 +416,14 @@ static void reset_bdev(struct zram *zram) static ssize_t backing_dev_show(struct device *dev, struct device_attribute *attr, char *buf) { + struct file *file; struct zram *zram = dev_to_zram(dev); - struct file *file = zram->backing_dev; char *p; ssize_t ret; down_read(&zram->init_lock); - if (!zram->backing_dev) { + file = zram->backing_dev; + if (!file) { memcpy(buf, "none\n", 5); up_read(&zram->init_lock); return 5; @@ -625,7 +629,7 @@ static ssize_t writeback_store(struct device *dev, struct bio bio; struct bio_vec bio_vec; struct page *page; - ssize_t ret; + ssize_t ret = len; int mode; unsigned long blk_idx = 0; @@ -761,7 +765,6 @@ next: if (blk_idx) free_block_bdev(zram, blk_idx); - ret = len; __free_page(page); release_init_lock: up_read(&zram->init_lock); |