diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/cciss.c | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 23 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 92 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 28 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 485 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nla.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_protocol.h | 12 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 196 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 74 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 38 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 107 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 54 | ||||
-rw-r--r-- | drivers/block/floppy.c | 2 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 1089 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.h | 32 | ||||
-rw-r--r-- | drivers/block/null_blk.c | 2 | ||||
-rw-r--r-- | drivers/block/skd_main.c | 7 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 4 |
20 files changed, 1154 insertions, 1106 deletions
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 73894ca33956..4595c22f33f7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h) goto default_int_mode; if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { - err = pci_enable_msix(h->pdev, cciss_msix_entries, 4); + err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); if (!err) { h->intr[0] = cciss_msix_entries[0].vector; h->intr[1] = cciss_msix_entries[1].vector; @@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h) h->intr[3] = cciss_msix_entries[3].vector; h->msix_vector = 1; return; - } - if (err > 0) { - dev_warn(&h->pdev->dev, - "only %d MSI-X vectors available\n", err); } else { dev_warn(&h->pdev->dev, "MSI-X init failed %d\n", err); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 90ae4ba8f9ee..05a1780ffa85 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -29,7 +29,6 @@ #include <linux/drbd_limits.h> #include <linux/dynamic_debug.h> #include "drbd_int.h" -#include "drbd_wrappers.h" enum al_transaction_types { @@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd BUG_ON(!bdev->md_bdev); - drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", (void*)_RET_IP_ ); @@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval return _al_get(device, first, true); } -static bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size if (size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; - int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. - 200 times -> 20 seconds. */ + bool sa; retry: sig = wait_event_interruptible(device->al_wait, @@ -1035,12 +1032,15 @@ retry: if (test_bit(BME_LOCKED, &bm_ext->flags)) return 0; + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(device->al_wait, !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || - test_bit(BME_PRIORITY, &bm_ext->flags)); + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); - if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { spin_lock_irq(&device->al_lock); if (lc_put(device->resync, &bm_ext->lce) == 0) { bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ @@ -1052,9 +1052,6 @@ retry: return -EINTR; if (schedule_timeout_interruptible(HZ/10)) return -EINTR; - if (sa && --sa == 0) - drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec." - "Resync stalled?\n"); goto retry; } } @@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e7093d4291f1..a76ceb344d64 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -382,6 +382,12 @@ enum { __EE_CALL_AL_COMPLETE_IO, __EE_MAY_SET_IN_SYNC, + /* is this a TRIM aka REQ_DISCARD? */ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + /* In case a barrier failed, * we need to resubmit without the barrier flag. */ __EE_RESUBMITTED, @@ -405,7 +411,9 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) @@ -579,6 +587,7 @@ struct drbd_resource { struct list_head resources; struct res_opts res_opts; struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ spinlock_t req_lock; unsigned susp:1; /* IO suspended by user */ @@ -609,6 +618,7 @@ struct drbd_connection { struct drbd_socket data; /* data/barrier/cstate/parameter packets */ struct drbd_socket meta; /* ping/ack (metadata) packets */ int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; unsigned long last_received; /* in jiffies, either socket */ unsigned int ko_count; @@ -814,6 +824,28 @@ struct drbd_device { struct submit_worker submit; }; +struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! */ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_device *device; + struct drbd_resource *resource; + struct drbd_connection *connection; +}; + static inline struct drbd_device *minor_to_device(unsigned int minor) { return (struct drbd_device *)idr_find(&drbd_devices, minor); @@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor) static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) { - return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } #define for_each_resource(resource, _resources) \ @@ -1139,6 +1171,12 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_device *device); @@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern rwlock_t global_state_lock; extern int conn_lowest_minor(struct drbd_connection *connection); -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern void drbd_destroy_device(struct kref *kref); -extern void drbd_delete_device(struct drbd_device *mdev); +extern void drbd_delete_device(struct drbd_device *device); extern struct drbd_resource *drbd_create_resource(const char *name); extern void drbd_free_resource(struct drbd_resource *resource); @@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(const char *info); +extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); extern int drbd_worker(struct drbd_thread *thi); enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); void drbd_resync_after_changed(struct drbd_device *device); @@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); extern void start_resync_timer_fn(unsigned long data); +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_asender(struct drbd_thread *thi); -extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, const int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, + bool, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock) (char*)&val, sizeof(val)); } +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + device_to_minor(device)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); /* drbd_proc.c */ @@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); @@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device) static inline struct drbd_connection *first_connection(struct drbd_resource *resource) { - return list_first_entry(&resource->connections, + return list_first_entry_or_null(&resource->connections, struct drbd_connection, connections); } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 331e5cc1227d..960645c26e6f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } -/* Used to send write requests - * R_PRIMARY -> Peer (P_DATA) +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) */ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) { @@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * dp_flags |= DP_SEND_WRITE_ACK; } p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ if (dgs) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); @@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * ... Be noisy about digest too large ... } */ } +out: mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ return err; @@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name) INIT_LIST_HEAD(&resource->connections); list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); spin_lock_init(&resource->req_lock); return resource; @@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device) return 0; } -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr) +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) { + struct drbd_resource *resource = adm_ctx->resource; struct drbd_connection *connection; struct drbd_device *device; struct drbd_peer_device *peer_device, *tmp_peer_device; struct gendisk *disk; struct request_queue *q; int id; + int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; device = minor_to_device(minor); @@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; } @@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_idr_remove_minor; } @@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); } goto out_idr_remove_from_resource; } @@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info("unable to create submit workqueue"); + drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 526414bc2cab..1b35c45c92b7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -34,7 +34,6 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" -#include "drbd_wrappers.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -82,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; -/* Configuration is strictly serialized, because generic netlink message - * processing is strictly serialized by the genl_lock(). - * Which means we can use one static global drbd_config_context struct. - */ -static struct drbd_config_context { - /* assigned from drbd_genlmsghdr */ - unsigned int minor; - /* assigned from request attributes, if present */ - unsigned int volume; -#define VOLUME_UNSPECIFIED (-1U) - /* pointer into the request skb, - * limited lifetime! */ - char *resource_name; - struct nlattr *my_addr; - struct nlattr *peer_addr; - - /* reply buffer */ - struct sk_buff *reply_skb; - /* pointer into reply buffer */ - struct drbd_genlmsghdr *reply_dh; - /* resolved from attributes, if possible */ - struct drbd_device *device; - struct drbd_resource *resource; - struct drbd_connection *connection; -} adm_ctx; - static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) { genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); @@ -117,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. */ -int drbd_msg_put_info(const char *info) +int drbd_msg_put_info(struct sk_buff *skb, const char *info) { - struct sk_buff *skb = adm_ctx.reply_skb; struct nlattr *nla; int err = -EMSGSIZE; @@ -143,42 +115,46 @@ int drbd_msg_put_info(const char *info) * and per-family private info->pointers. * But we need to stay compatible with older kernels. * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. */ #define DRBD_ADM_NEED_MINOR 1 #define DRBD_ADM_NEED_RESOURCE 2 #define DRBD_ADM_NEED_CONNECTION 4 -static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, - unsigned flags) +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) { struct drbd_genlmsghdr *d_in = info->userhdr; const u8 cmd = info->genlhdr->cmd; int err; - memset(&adm_ctx, 0, sizeof(adm_ctx)); + memset(adm_ctx, 0, sizeof(*adm_ctx)); /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; - adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!adm_ctx.reply_skb) { + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, info, &drbd_genl_family, 0, cmd); /* put of a few bytes into a fresh skb of >= 4k will always succeed. * but anyways */ - if (!adm_ctx.reply_dh) { + if (!adm_ctx->reply_dh) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh->minor = d_in->minor; - adm_ctx.reply_dh->ret_code = NO_ERROR; + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; - adm_ctx.volume = VOLUME_UNSPECIFIED; + adm_ctx->volume = VOLUME_UNSPECIFIED; if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { struct nlattr *nla; /* parse and validate only */ @@ -188,111 +164,131 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, /* It was present, and valid, * copy it over to the reply skb. */ - err = nla_put_nohdr(adm_ctx.reply_skb, + err = nla_put_nohdr(adm_ctx->reply_skb, info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, info->attrs[DRBD_NLA_CFG_CONTEXT]); if (err) goto fail; - /* and assign stuff to the global adm_ctx */ + /* and assign stuff to the adm_ctx */ nla = nested_attr_tb[__nla_type(T_ctx_volume)]; if (nla) - adm_ctx.volume = nla_get_u32(nla); + adm_ctx->volume = nla_get_u32(nla); nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; if (nla) - adm_ctx.resource_name = nla_data(nla); - adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; - if ((adm_ctx.my_addr && - nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) || - (adm_ctx.peer_addr && - nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) { + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { err = -EINVAL; goto fail; } } - adm_ctx.minor = d_in->minor; - adm_ctx.device = minor_to_device(d_in->minor); - if (adm_ctx.resource_name) { - adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name); + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. */ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); } - if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) { - drbd_msg_put_info("unknown minor"); + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); return ERR_MINOR_INVALID; } - if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("unknown resource"); - if (adm_ctx.resource_name) + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) return ERR_RES_NOT_KNOWN; return ERR_INVALID_REQUEST; } if (flags & DRBD_ADM_NEED_CONNECTION) { - if (adm_ctx.resource) { - drbd_msg_put_info("no resource name expected"); + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device) { - drbd_msg_put_info("no minor number expected"); + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.my_addr && adm_ctx.peer_addr) - adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr), - nla_len(adm_ctx.my_addr), - nla_data(adm_ctx.peer_addr), - nla_len(adm_ctx.peer_addr)); - if (!adm_ctx.connection) { - drbd_msg_put_info("unknown connection"); + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); return ERR_INVALID_REQUEST; } } /* some more paranoia, if the request was over-determined */ - if (adm_ctx.device && adm_ctx.resource && - adm_ctx.device->resource != adm_ctx.resource) { + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", - adm_ctx.minor, adm_ctx.resource->name, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists in different resource"); + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device && - adm_ctx.volume != VOLUME_UNSPECIFIED && - adm_ctx.volume != adm_ctx.device->vnr) { + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx.minor, adm_ctx.volume, - adm_ctx.device->vnr, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists as different volume"); + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST; } + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + return NO_ERROR; fail: - nlmsg_free(adm_ctx.reply_skb); - adm_ctx.reply_skb = NULL; + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } -static int drbd_adm_finish(struct genl_info *info, int retcode) +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) { - if (adm_ctx.connection) { - kref_put(&adm_ctx.connection->kref, drbd_destroy_connection); - adm_ctx.connection = NULL; + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; } - if (adm_ctx.resource) { - kref_put(&adm_ctx.resource->kref, drbd_destroy_resource); - adm_ctx.resource = NULL; + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; } - if (!adm_ctx.reply_skb) + if (!adm_ctx->reply_skb) return -ENOMEM; - adm_ctx.reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx.reply_skb, info); + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); return 0; } @@ -426,6 +422,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + return fp; } @@ -438,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; + spin_lock_irq(&connection->resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); return false; } - spin_lock_irq(&connection->resource->req_lock); connect_cnt = connection->connect_cnt; spin_unlock_irq(&connection->resource->req_lock); @@ -654,11 +659,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) put_ldev(device); } } else { - mutex_lock(&device->resource->conf_update); + /* Called from drbd_adm_set_role only. + * We are still holding the conf_update mutex. */ nc = first_peer_device(device)->connection->net_conf; if (nc) nc->discard_my_data = 0; /* without copy; single bit op is atomic */ - mutex_unlock(&device->resource->conf_update); set_disk_ro(device->vdisk, false); if (get_ldev(device)) { @@ -700,11 +705,12 @@ static const char *from_attrs_err_to_txt(int err) int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct set_role_parms parms; int err; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -715,17 +721,22 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1104,15 +1115,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; + struct request_queue *b = NULL; if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + b = device->ldev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; rcu_read_unlock(); - put_ldev(device); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); } blk_queue_logical_block_size(q, 512); @@ -1121,8 +1135,25 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? */ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } blk_queue_stack_limits(q, b); @@ -1164,8 +1195,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device) peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ else peer = DRBD_MAX_BIO_SIZE; - } + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } new = min(local, peer); if (device->state.role == R_PRIMARY && new < now) @@ -1258,19 +1295,21 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; int err, fifo_size; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1294,7 +1333,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1385,12 +1424,15 @@ fail_unlock: success: put_ldev(device); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int err; enum drbd_ret_code retcode; @@ -1406,13 +1448,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); conn_reconfig_start(first_peer_device(device)->connection); /* if you want to reconfigure, please tear down first */ @@ -1455,7 +1498,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1619,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } if (device->state.conn < C_CONNECTED && - device->state.role == R_PRIMARY && + device->state.role == R_PRIMARY && device->ed_uuid && (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { drbd_err(device, "Can only attach to data with current UUID=%016llX\n", (unsigned long long)device->ed_uuid); @@ -1797,7 +1840,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(first_peer_device(device)->connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; force_diskless_dec: @@ -1819,9 +1863,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - + mutex_unlock(&adm_ctx.resource->adm_mutex); finish: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1860,11 +1904,12 @@ out: * Only then we have finally detached. */ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -1874,14 +1919,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2055,6 +2102,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2063,13 +2111,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_net_conf) { @@ -2084,7 +2133,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info("net conf missing, try connect"); + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2096,7 +2145,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2167,12 +2216,15 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2182,14 +2234,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info("connection endpoint(s) missing"); + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2215,6 +2267,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } + mutex_lock(&adm_ctx.resource->adm_mutex); connection = first_connection(adm_ctx.resource); conn_reconfig_start(connection); @@ -2235,7 +2288,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2284,7 +2337,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail: @@ -2292,8 +2346,9 @@ fail: kfree(new_net_conf); conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2356,13 +2411,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2374,18 +2430,20 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } } + mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); if (rv < SS_SUCCESS) retcode = rv; /* FIXME: Type mismatch. */ else retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2407,6 +2465,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct drbd_device *device; @@ -2417,12 +2476,13 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto fail; + goto finish; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; @@ -2436,7 +2496,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2482,7 +2542,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) goto fail_ldev; } - if (device->state.conn != C_CONNECTED) { + if (device->state.conn != C_CONNECTED && !rs.resize_force) { retcode = ERR_MD_LAYOUT_CONNECTED; goto fail_ldev; } @@ -2528,7 +2588,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail_ldev: @@ -2538,11 +2600,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2555,33 +2618,37 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } + mutex_lock(&adm_ctx.resource->adm_mutex); err = set_resource_options(adm_ctx.resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2605,26 +2672,29 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2639,15 +2709,17 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device) int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2674,40 +2746,45 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { s = adm_ctx.device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { @@ -2717,9 +2794,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2730,15 +2807,17 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { drbd_uuid_new_current(device); @@ -2753,9 +2832,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2931,10 +3010,11 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2946,7 +3026,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3133,11 +3213,12 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct timeout_parms tp; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3154,17 +3235,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3179,10 +3261,12 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); + /* w_make_ov_request expects position to be aligned */ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); device->ov_stop_sector = parms.ov_stop_sector; @@ -3193,21 +3277,24 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3219,11 +3306,12 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } + mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ if (!get_ldev(device)) { @@ -3268,22 +3356,24 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); out_nolock: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static enum drbd_ret_code -drbd_check_resource_name(const char *name) +drbd_check_resource_name(struct drbd_config_context *adm_ctx) { + const char *name = adm_ctx->resource_name; if (!name || !name[0]) { - drbd_msg_put_info("resource name missing"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); return ERR_MANDATORY_TAG; } /* if we want to use these in sysfs/configfs/debugfs some day, * we must not allow slashes */ if (strchr(name, '/')) { - drbd_msg_put_info("invalid resource name"); + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); return ERR_INVALID_REQUEST; } return NO_ERROR; @@ -3291,11 +3381,12 @@ drbd_check_resource_name(const char *name) int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, 0); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3305,48 +3396,50 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(adm_ctx.resource_name); + retcode = drbd_check_resource_name(&adm_ctx); if (retcode != NO_ERROR) goto out; if (adm_ctx.resource) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info("resource exists"); + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; } + /* not yet safe for genl_family.parallel_ops */ if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_genlmsghdr *dh = info->userhdr; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info("requested minor out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info("requested volume id out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -3360,9 +3453,11 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) goto out; } - retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume); + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3383,35 +3478,40 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { struct drbd_peer_device *peer_device; @@ -3419,14 +3519,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to demote"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to disconnect"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); goto out; } } @@ -3435,7 +3535,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info("failed to detach"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); goto out; } } @@ -3453,7 +3553,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info("failed to delete volume"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); goto out; } } @@ -3462,25 +3562,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) synchronize_rcu(); drbd_free_resource(resource); retcode = NO_ERROR; - out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); for_each_connection(connection, resource) { if (connection->cstate > C_STANDALONE) { retcode = ERR_NET_CONFIGURED; @@ -3499,7 +3602,9 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) drbd_free_resource(resource); retcode = NO_ERROR; out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c index fa672b6df8d6..b2d4791498a6 100644 --- a/drivers/block/drbd/drbd_nla.c +++ b/drivers/block/drbd/drbd_nla.c @@ -1,4 +1,3 @@ -#include "drbd_wrappers.h" #include <linux/kernel.h> #include <net/netlink.h> #include <linux/drbd_genl_api.h> diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2f26e8ffa45b..89736bdbbc70 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se /* ------------------------ ~18s average ------------------------ */ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; dt = (jiffies - device->rs_mark_time[i]) / HZ; - if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + if (dt > 180) stalled = 1; if (!dt) diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 3c04ec0ea333..2da9104a3851 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -54,6 +54,11 @@ enum drbd_packet { P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -119,6 +124,11 @@ struct p_data { u32 dp_flags; } __packed; +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -150,6 +160,8 @@ struct p_block_req { * ReportParams */ +#define FF_TRIM 1 + struct p_connection_features { u32 protocol_min; u32 feature_flags; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 68e3992e8838..b6c8aaf4931b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -46,9 +46,10 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" - #include "drbd_vli.h" +#define PRO_FEATURES (FF_TRIM) + struct packet_info { enum drbd_packet cmd; unsigned int size; @@ -65,7 +66,7 @@ enum finish_epoch { static int drbd_do_features(struct drbd_connection *connection); static int drbd_do_auth(struct drbd_connection *connection); static int drbd_disconnected(struct drbd_peer_device *); - +static void conn_wait_active_ee_empty(struct drbd_connection *connection); static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); static int e_end_block(struct drbd_work *, int); @@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from - * the kernel, unless this allocation would exceed the max_buffers setting. + * the kernel. * Possibly retry until DRBD frees sufficient pages somewhere else. * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * * Returns a page chain linked via page->private. */ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, @@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int struct page *page = NULL; struct net_conf *nc; DEFINE_WAIT(wait); - int mxb; + unsigned int mxb; - /* Yes, we may run up to @number over max_buffers. If we - * follow it strictly, the admin will get it wrong anyways. */ rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); mxb = nc ? nc->max_buffers : 1000000; @@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int break; } - schedule(); + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; } finish_wait(&drbd_pp_wait, &wait); @@ -331,7 +339,7 @@ You must not have the req_lock: struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, gfp_t gfp_mask) __must_hold(local) + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; @@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (data_size) { + if (has_payload && data_size) { page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); if (!page) goto fail; @@ -1026,24 +1034,27 @@ randomize: if (drbd_send_protocol(connection) == -EOPNOTSUPP) return -1; + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + set_bit(STATE_SENT, &connection->flags); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; kref_get(&device->kref); rcu_read_unlock(); - /* Prevent a race between resync-handshake and - * being promoted to Primary. - * - * Grab and release the state mutex, so we know that any current - * drbd_set_role() is finished, and any incoming drbd_set_role - * will see the STATE_SENT flag, and wait for it to be cleared. - */ - mutex_lock(device->state_mutex); - mutex_unlock(device->state_mutex); - if (discard_my_data) set_bit(DISCARD_MY_DATA, &device->flags); else @@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, ds >> 9, GFP_NOIO)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 0; /* discards don't have any payload. */ + /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device, next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed\n"); + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); goto fail; } /* > peer_req->i.sector, unless this is the first bio */ @@ -1340,6 +1365,11 @@ next_bio: bios = bio; ++n_bios; + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = ds; + goto submit; + } + page_chain_for_each(page) { unsigned len = min_t(unsigned, ds, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1360,8 +1390,9 @@ next_bio: sector += len >> 9; --nr_pages; } - D_ASSERT(device, page == NULL); D_ASSERT(device, ds == 0); +submit: + D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); do { @@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * and from receive_Data */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - int data_size) __must_hold(local) + struct packet_info *pi) __must_hold(local) { struct drbd_device *device = peer_device->device; const sector_t capacity = drbd_get_capacity(device->this_bdev); struct drbd_peer_request *peer_req; struct page *page; int dgs, ds, err; + int data_size = pi->size; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; dgs = 0; - if (peer_device->connection->peer_integrity_tfm) { + if (!trim && peer_device->connection->peer_integrity_tfm) { dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer @@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= dgs; } + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + if (!expect(IS_ALIGNED(data_size, 512))) return NULL; - if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); if (!peer_req) return NULL; - if (!data_size) + if (trim) return peer_req; ds = data_size; @@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused) } static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, - int data_size) __releases(local) + struct packet_info *pi) __releases(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; - peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size); + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); if (!peer_req) goto fail; @@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto list_add(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); - atomic_add(data_size >> 9, &device->rs_sect_ev); + atomic_add(pi->size >> 9, &device->rs_sect_ev); if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, * or in drbd_peer_request_endio. */ - err = recv_resync_read(peer_device, sector, pi->size); + err = recv_resync_read(peer_device, sector, pi); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Can not write resync data to local disk.\n"); @@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * */ sector = be64_to_cpu(p->sector); - peer_req = read_in_block(peer_device, p->block_id, sector, pi->size); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); if (!peer_req) { put_ldev(device); return -EIO; @@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(dp_flags); - if (peer_req->pages == NULL) { + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); } @@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - list_add(&peer_req->w.list, &device->active_ee); + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. */ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (device->state.conn == C_SYNC_TARGET) @@ -2313,39 +2365,45 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) { - struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; - unsigned long db, dt, dbdt; struct lc_element *tmp; - int curr_events; - int throttle = 0; - unsigned int c_min_rate; - - rcu_read_lock(); - c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; - rcu_read_unlock(); + bool throttle = true; - /* feature disabled? */ - if (c_min_rate == 0) - return 0; + if (!drbd_rs_c_min_rate_throttle(device)) + return false; spin_lock_irq(&device->al_lock); tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); if (tmp) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_PRIORITY, &bm_ext->flags)) { - spin_unlock_irq(&device->al_lock); - return 0; - } + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; /* Do not slow down if app IO is already waiting for this extent */ } spin_unlock_irq(&device->al_lock); + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + (int)part_stat_read(&disk->part0, sectors[1]) - atomic_read(&device->rs_sect_ev); - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) dbdt = Bit2KB(db/dt); if (dbdt > c_min_rate) - throttle = 1; + return true; } - return throttle; + return false; } - static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) { struct drbd_peer_device *peer_device; @@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info put_ldev(device); } + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(device); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { dd = drbd_determine_dev_size(device, ddsf, NULL); @@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info drbd_set_my_capacity(device, p_size); } - device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - drbd_reconsider_max_bio_size(device); - if (get_ldev(device)) { if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); @@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection) memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); } @@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection) goto incompat; connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + return 1; incompat: @@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + peers_ch = kmalloc(pi.size, GFP_NOIO); if (peers_ch == NULL) { drbd_err(connection, "kmalloc of peers_ch failed\n"); @@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3779c8d2875b..09803d0d5207 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + case QUEUE_FOR_NET_READ: /* READ or READA, and * no local disk, @@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws) if (list_empty(&incoming)) break; +skip_fast_path: wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); /* Maybe more was queued, while we prepared the transaction? * Try to stuff them into this transaction as well. @@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws) list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + + /* If all currently hot activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to cold extents. In that case, prepare one request + * in blocking mode. */ + list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { + list_del_init(&req->tl_requests); + req->rq_state |= RQ_IN_ACT_LOG; + if (!drbd_al_begin_io_prepare(device, &req->i)) { + /* Corresponding extent was hot after all? */ + drbd_send_and_submit(device, req); + } else { + /* Found a request to a cold extent. + * Put on "pending" list, + * and try to cumulate with more. */ + list_add(&req->tl_requests, &pending); + goto skip_fast_path; + } + } } } @@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -static struct drbd_request *find_oldest_request(struct drbd_connection *connection) +static void find_oldest_requests( + struct drbd_connection *connection, + struct drbd_device *device, + struct drbd_request **oldest_req_waiting_for_peer, + struct drbd_request **oldest_req_waiting_for_disk) { - /* Walk the transfer log, - * and find the oldest not yet completed request */ struct drbd_request *r; + *oldest_req_waiting_for_peer = NULL; + *oldest_req_waiting_for_disk = NULL; list_for_each_entry(r, &connection->transfer_log, tl_requests) { - if (atomic_read(&r->completion_ref)) - return r; + const unsigned s = r->rq_state; + if (!*oldest_req_waiting_for_peer + && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) + *oldest_req_waiting_for_peer = r; + + if (!*oldest_req_waiting_for_disk + && (s & RQ_LOCAL_PENDING) && r->device == device) + *oldest_req_waiting_for_disk = r; + + if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) + break; } - return NULL; } void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; struct drbd_connection *connection = first_peer_device(device)->connection; - struct drbd_request *req; /* oldest request */ + struct drbd_request *req_disk, *req_peer; /* oldest request */ struct net_conf *nc; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; @@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data) now = jiffies; spin_lock_irq(&device->resource->req_lock); - req = find_oldest_request(connection); - if (!req) { + find_oldest_requests(connection, device, &req_peer, &req_disk); + if (req_peer == NULL && req_disk == NULL) { spin_unlock_irq(&device->resource->req_lock); mod_timer(&device->request_timer, now + et); return; @@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data) * ~198 days with 250 HZ, we have a window where the timeout would need * to expire twice (worst case) to become effective. Good enough. */ - if (ent && req->rq_state & RQ_NET_PENDING && - time_after(now, req->start_time + ent) && + if (ent && req_peer && + time_after(now, req_peer->start_time + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device && - time_after(now, req->start_time + dt) && + if (dt && req_disk && + time_after(now, req_disk->start_time + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(device, DRBD_FORCE_DETACH); } - nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) + ? req_peer->start_time + ent : now + et; + dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) + ? req_disk->start_time + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; spin_unlock_irq(&connection->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c684c963538e..8566cd5866b4 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -30,7 +30,6 @@ #include <linux/slab.h> #include <linux/drbd.h> #include "drbd_int.h" -#include "drbd_wrappers.h" /* The request callbacks will be called in irq context by the IDE drivers, and in Softirqs/Tasklets/BH context by the SCSI drivers, @@ -111,11 +110,14 @@ enum drbd_req_event { BARRIER_ACKED, /* in protocol A and B */ DATA_RECEIVED, /* (remote read) */ + COMPLETED_OK, READ_COMPLETED_WITH_ERROR, READ_AHEAD_COMPLETED_WITH_ERROR, WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + ABORT_DISK_IO, - COMPLETED_OK, RESEND, FAIL_FROZEN_DISK_IO, RESTART_FROZEN_DISK_IO, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 1a84345a3868..a5d8aae00e04 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); static inline bool is_susp(union drbd_state s) { @@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv >= SS_SUCCESS) rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ @@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv < SS_SUCCESS) { spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st * When we loose connection, we have to set the state of the peers disk (pdsk) * to D_UNKNOWN. This rule and many more along those lines are in this function. */ -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn) +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; @@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st } if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { @@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, os = drbd_read_state(device); - ns = sanitize_state(device, ns, &ssw); + ns = sanitize_state(device, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union number_of_volumes++; os = drbd_read_state(device); ns = apply_mask_val(os, mask, val); - ns = sanitize_state(device, ns, NULL); + ns = sanitize_state(device, os, ns, NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union static enum drbd_state_rv _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) { - enum drbd_state_rv rv; + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) - return SS_CW_SUCCESS; + rv = SS_CW_SUCCESS; if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) - return SS_CW_FAILED_BY_PEER; + rv = SS_CW_FAILED_BY_PEER; - rv = conn_is_valid_transition(connection, mask, val, 0); - if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) - rv = SS_UNKNOWN_ERROR; /* continue waiting */ + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; - return rv; + return err; } enum drbd_state_rv diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2c4ce42c3657..d8f57b6305cd 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ -static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; @@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + /* FIXME do we want to detach for failed REQ_DISCARD? + * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error) struct drbd_device *device = peer_req->peer_device->device; int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); if (error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", - is_write ? "write" : "read", error, + is_write ? (is_discard ? "discard" : "write") + : "read", error, (unsigned long long)peer_req->i.sector); if (!error && !uptodate) { if (__ratelimit(&drbd_ratelimit_state)) @@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error) /* to avoid recursion in __req_mod */ if (unlikely(error)) { - what = (bio_data_dir(bio) == WRITE) + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) ? WRITE_COMPLETED_WITH_ERROR : (bio_rw(bio) == READ) ? READ_COMPLETED_WITH_ERROR @@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, GFP_TRY); + size, true /* has real payload */, GFP_TRY); if (!peer_req) goto defer; @@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size) return fb; } -static int drbd_rs_controller(struct drbd_device *device) +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) { struct disk_conf *dc; - unsigned int sect_in; /* Number of sectors that came in since the last turn */ unsigned int want; /* The number of sectors we want in the proxy */ int req_sect; /* Number of sectors to request in this turn */ int correction; /* Number of sectors more we need in the proxy*/ @@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device) int max_sect; struct fifo_buffer *plan; - sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */ - device->rs_in_flight -= sect_in; - dc = rcu_dereference(device->ldev->disk_conf); plan = rcu_dereference(device->rs_plan_s); @@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device) static int drbd_rs_number_requests(struct drbd_device *device) { - int number; + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; if (rcu_dereference(device->rs_plan_s)->size) { - number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9); + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; @@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device) } rcu_read_unlock(); - /* ignore the amount of pending requests, the resync controller should - * throttle down to incoming reply rate soon enough anyways. */ + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + if (mxb - device->rs_in_flight < number) + number = mxb - device->rs_in_flight; + return number; } @@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel) max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); - if (number == 0) + if (number <= 0) goto requeue; for (i = 0; i < number; i++) { @@ -647,7 +663,7 @@ next_sector: */ align = 1; rollback_i = i; - for (;;) { + while (i < number) { if (size + BM_BLOCK_SIZE > max_bio_size) break; @@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } clear_bit(B_RS_H_DONE, &device->flags); - write_lock_irq(&global_state_lock); + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); mutex_unlock(device->state_mutex); return; } @@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } _drbd_pause_after(device); } - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); if (r == SS_SUCCESS) { /* reset rs_last_bcast when a resync or verify is started, @@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) mutex_unlock(device->state_mutex); } -/* If the resource already closed the current epoch, but we did not - * (because we have not yet seen new requests), we should send the - * corresponding barrier now. Must be checked within the same spinlock - * that is used to check for new requests. */ -static bool need_to_send_barrier(struct drbd_connection *connection) -{ - if (!connection->send.seen_any_write_yet) - return false; - - /* Skip barriers that do not contain any writes. - * This may happen during AHEAD mode. */ - if (!connection->send.current_epoch_writes) - return false; - - /* ->req_lock is held when requests are queued on - * connection->sender_work, and put into ->transfer_log. - * It is also held when ->current_tle_nr is increased. - * So either there are already new requests queued, - * and corresponding barriers will be send there. - * Or nothing new is queued yet, so the difference will be 1. - */ - if (atomic_read(&connection->current_tle_nr) != - connection->send.current_epoch_nr + 1) - return false; - - return true; -} - static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) { spin_lock_irq(&queue->q_lock); @@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * spin_unlock_irq(&connection->resource->req_lock); break; } - send_barrier = need_to_send_barrier(connection); + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; spin_unlock_irq(&connection->resource->req_lock); - if (send_barrier) { - drbd_send_barrier(connection); - connection->send.current_epoch_nr++; - } + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); schedule(); /* may be woken up for other things but new work, too, * e.g. if the current epoch got closed. diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h deleted file mode 100644 index 3db9ebaf64f6..000000000000 --- a/drivers/block/drbd/drbd_wrappers.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _DRBD_WRAPPERS_H -#define _DRBD_WRAPPERS_H - -#include <linux/ctype.h> -#include <linux/mm.h> -#include "drbd_int.h" - -/* see get_sb_bdev and bd_claim */ -extern char *drbd_sec_holder; - -/* sets the number of 512 byte sectors of our virtual device */ -static inline void drbd_set_my_capacity(struct drbd_device *device, - sector_t size) -{ - /* set_capacity(device->this_bdev->bd_disk, size); */ - set_capacity(device->vdisk, size); - device->this_bdev->bd_inode->i_size = (loff_t)size << 9; -} - -#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) - -/* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); - -/* - * used to submit our private bio - */ -static inline void drbd_generic_make_request(struct drbd_device *device, - int fault_type, struct bio *bio) -{ - __release(local); - if (!bio->bi_bdev) { - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " - "bio->bi_bdev == NULL\n", - device_to_minor(device)); - dump_stack(); - bio_endio(bio, -ENODEV); - return; - } - - if (drbd_insert_fault(device, fault_type)) - bio_endio(bio, -EIO); - else - generic_make_request(bio); -} - -#ifndef __CHECKER__ -# undef __cond_lock -# define __cond_lock(x,c) (c) -#endif - -#endif diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 5f69c910c3ac..8e767bb7995e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3809,7 +3809,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) bio.bi_iter.bi_size = size; bio.bi_bdev = bdev; bio.bi_iter.bi_sector = 0; - bio.bi_flags = (1 << BIO_QUIET); + bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 59c5abe32f06..74abd49fabdc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bio.h> #include <linux/dma-mapping.h> #include <linux/idr.h> @@ -173,60 +174,36 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -/* - * Obtain an empty command slot. - * - * This function needs to be reentrant since it could be called - * at the same time on multiple CPUs. The allocation of the - * command slot must be atomic. - * - * @port Pointer to the port data structure. - * - * return value - * >= 0 Index of command slot obtained. - * -1 No command slots available. - */ -static int get_slot(struct mtip_port *port) +static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { - int slot, i; - unsigned int num_command_slots = port->dd->slot_groups * 32; + struct request *rq; - /* - * Try 10 times, because there is a small race here. - * that's ok, because it's still cheaper than a lock. - * - * Race: Since this section is not protected by lock, same bit - * could be chosen by different process contexts running in - * different processor. So instead of costly lock, we are going - * with loop. - */ - for (i = 0; i < 10; i++) { - slot = find_next_zero_bit(port->allocated, - num_command_slots, 1); - if ((slot < num_command_slots) && - (!test_and_set_bit(slot, port->allocated))) - return slot; - } - dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + return blk_mq_rq_to_pdu(rq); +} - mtip_check_surprise_removal(port->dd->pdev); - return -1; +static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd) +{ + blk_put_request(blk_mq_rq_from_pdu(cmd)); } /* - * Release a command slot. - * - * @port Pointer to the port data structure. - * @tag Tag of command to release - * - * return value - * None + * Once we add support for one hctx per mtip group, this will change a bit */ -static inline void release_slot(struct mtip_port *port, int tag) +static struct request *mtip_rq_from_tag(struct driver_data *dd, + unsigned int tag) +{ + struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; + + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, + unsigned int tag) { - smp_mb__before_clear_bit(); - clear_bit(tag, port->allocated); - smp_mb__after_clear_bit(); + struct request *rq = mtip_rq_from_tag(dd, tag); + + return blk_mq_rq_to_pdu(rq); } /* @@ -248,93 +225,28 @@ static inline void release_slot(struct mtip_port *port, int tag) * None */ static void mtip_async_complete(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *cmd, int status) { - struct mtip_cmd *cmd; - struct driver_data *dd = data; - int unaligned, cb_status = status ? -EIO : 0; - void (*func)(void *, int); + struct driver_data *dd = port->dd; + struct request *rq; if (unlikely(!dd) || unlikely(!port)) return; - cmd = &port->commands[tag]; - if (unlikely(status == PORT_IRQ_TF_ERR)) { dev_warn(&port->dd->pdev->dev, "Command tag %d failed due to TFE\n", tag); } - /* Clear the active flag */ - atomic_set(&port->commands[tag].active, 0); - - /* Upper layer callback */ - func = cmd->async_callback; - if (likely(func && cmpxchg(&cmd->async_callback, func, 0) == func)) { + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction); - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); + rq = mtip_rq_from_tag(dd, tag); - func(cmd->async_data, cb_status); - unaligned = cmd->unaligned; + if (unlikely(cmd->unaligned)) + up(&port->cmd_slot_unal); - /* Clear the allocated bit for the command */ - release_slot(port, tag); - - if (unlikely(unaligned)) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } -} - -/* - * This function is called for clean the pending command in the - * command slot during the surprise removal of device and return - * error to the upper layer. - * - * @dd Pointer to the DRIVER_DATA structure. - * - * return value - * None - */ -static void mtip_command_cleanup(struct driver_data *dd) -{ - int tag = 0; - struct mtip_cmd *cmd; - struct mtip_port *port = dd->port; - unsigned int num_cmd_slots = dd->slot_groups * 32; - - if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - return; - - if (!port) - return; - - cmd = &port->commands[MTIP_TAG_INTERNAL]; - if (atomic_read(&cmd->active)) - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & - (1 << MTIP_TAG_INTERNAL)) - if (cmd->comp_func) - cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, -ENODEV); - - while (1) { - tag = find_next_bit(port->allocated, num_cmd_slots, tag); - if (tag >= num_cmd_slots) - break; - - cmd = &port->commands[tag]; - if (atomic_read(&cmd->active)) - mtip_async_complete(port, tag, dd, -ENODEV); - } - - set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); + blk_mq_end_io(rq, status ? -EIO : 0); } /* @@ -388,8 +300,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { int group = tag >> 5; - atomic_set(&port->commands[tag].active, 1); - /* guard SACT and CI registers */ spin_lock(&port->cmd_issue_lock[group]); writel((1 << MTIP_TAG_BIT(tag)), @@ -397,10 +307,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); spin_unlock(&port->cmd_issue_lock[group]); - - /* Set the command's timeout value.*/ - port->commands[tag].comp_time = jiffies + msecs_to_jiffies( - MTIP_NCQ_COMMAND_TIMEOUT_MS); } /* @@ -648,132 +554,13 @@ static void print_tags(struct driver_data *dd, memset(tagmap, 0, sizeof(tagmap)); for (group = SLOTBITS_IN_LONGS; group > 0; group--) - tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ", + tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ", tagbits[group-1]); dev_warn(&dd->pdev->dev, "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); } /* - * Called periodically to see if any read/write commands are - * taking too long to complete. - * - * @data Pointer to the PORT data structure. - * - * return value - * None - */ -static void mtip_timeout_function(unsigned long int data) -{ - struct mtip_port *port = (struct mtip_port *) data; - struct host_to_dev_fis *fis; - struct mtip_cmd *cmd; - int unaligned, tag, cmdto_cnt = 0; - unsigned int bit, group; - unsigned int num_command_slots; - unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; - void (*func)(void *, int); - - if (unlikely(!port)) - return; - - if (unlikely(port->dd->sr)) - return; - - if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(30000)); - return; - } - /* clear the tag accumulator */ - memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); - num_command_slots = port->dd->slot_groups * 32; - - for (tag = 0; tag < num_command_slots; tag++) { - /* - * Skip internal command slot as it has - * its own timeout mechanism - */ - if (tag == MTIP_TAG_INTERNAL) - continue; - - if (atomic_read(&port->commands[tag].active) && - (time_after(jiffies, port->commands[tag].comp_time))) { - group = tag >> 5; - bit = tag & 0x1F; - - cmd = &port->commands[tag]; - fis = (struct host_to_dev_fis *) cmd->command; - - set_bit(tag, tagaccum); - cmdto_cnt++; - if (cmdto_cnt == 1) - set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - - /* - * Clear the completed bit. This should prevent - * any interrupt handlers from trying to retire - * the command. - */ - writel(1 << bit, port->completed[group]); - - /* Clear the active flag for the command */ - atomic_set(&port->commands[tag].active, 0); - - func = cmd->async_callback; - if (func && - cmpxchg(&cmd->async_callback, func, 0) == func) { - - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&port->dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); - - func(cmd->async_data, -EIO); - unaligned = cmd->unaligned; - - /* Clear the allocated bit for the command. */ - release_slot(port, tag); - - if (unaligned) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } - } - } - - if (cmdto_cnt) { - print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - mtip_device_reset(port->dd); - wake_up_interruptible(&port->svc_wait); - } - clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - } - - if (port->ic_pause_timer) { - to = port->ic_pause_timer + msecs_to_jiffies(1000); - if (time_after(jiffies, to)) { - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - port->ic_pause_timer = 0; - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); - } - - - } - } - - /* Restart the timer */ - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); -} - -/* * Internal command completion callback function. * * This function is normally called by the driver ISR when an internal @@ -789,28 +576,19 @@ static void mtip_timeout_function(unsigned long int data) * None */ static void mtip_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - struct mtip_cmd *command = &port->commands[tag]; - struct completion *waiting = data; + struct completion *waiting = command->comp_data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - return; } static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, @@ -842,19 +620,16 @@ static void mtip_handle_tfe(struct driver_data *dd) port = dd->port; - /* Stop the timer to prevent command timeouts. */ - del_timer(&port->cmd_timer); set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && test_bit(MTIP_TAG_INTERNAL, port->allocated)) { - cmd = &port->commands[MTIP_TAG_INTERNAL]; + cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd, PORT_IRQ_TF_ERR); } goto handle_tfe_exit; } @@ -866,6 +641,8 @@ static void mtip_handle_tfe(struct driver_data *dd) for (group = 0; group < dd->slot_groups; group++) { completed = readl(port->completed[group]); + dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed); + /* clear completed status register in the hardware.*/ writel(completed, port->completed[group]); @@ -879,15 +656,11 @@ static void mtip_handle_tfe(struct driver_data *dd) if (tag == MTIP_TAG_INTERNAL) continue; - cmd = &port->commands[tag]; + cmd = mtip_cmd_from_tag(dd, tag); if (likely(cmd->comp_func)) { set_bit(tag, tagaccum); cmd_cnt++; - atomic_set(&cmd->active, 0); - cmd->comp_func(port, - tag, - cmd->comp_data, - 0); + cmd->comp_func(port, tag, cmd, 0); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -947,11 +720,7 @@ static void mtip_handle_tfe(struct driver_data *dd) for (bit = 0; bit < 32; bit++) { reissue = 1; tag = (group << 5) + bit; - cmd = &port->commands[tag]; - - /* If the active bit is set re-issue the command */ - if (atomic_read(&cmd->active) == 0) - continue; + cmd = mtip_cmd_from_tag(dd, tag); fis = (struct host_to_dev_fis *)cmd->command; @@ -970,11 +739,9 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? fail_reason : "unknown"); - atomic_set(&cmd->active, 0); if (cmd->comp_func) { cmd->comp_func(port, tag, - cmd->comp_data, - -ENODATA); + cmd, -ENODATA); } continue; } @@ -997,14 +764,9 @@ static void mtip_handle_tfe(struct driver_data *dd) /* Retire a command that will not be reissued */ dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - atomic_set(&cmd->active, 0); if (cmd->comp_func) - cmd->comp_func( - port, - tag, - cmd->comp_data, - PORT_IRQ_TF_ERR); + cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", @@ -1017,9 +779,6 @@ handle_tfe_exit: /* clear eh_active */ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); - - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); } /* @@ -1048,15 +807,10 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (unlikely(tag == MTIP_TAG_INTERNAL)) continue; - command = &port->commands[tag]; - /* make internal callback */ - if (likely(command->comp_func)) { - command->comp_func( - port, - tag, - command->comp_data, - 0); - } else { + command = mtip_cmd_from_tag(dd, tag); + if (likely(command->comp_func)) + command->comp_func(port, tag, command, 0); + else { dev_dbg(&dd->pdev->dev, "Null completion for tag %d", tag); @@ -1081,16 +835,13 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) { struct mtip_port *port = dd->port; - struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; + struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))) { if (cmd->comp_func) { - cmd->comp_func(port, - MTIP_TAG_INTERNAL, - cmd->comp_data, - 0); + cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0); return; } } @@ -1103,8 +854,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) */ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) { - if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) - mtip_handle_tfe(dd); if (unlikely(port_stat & PORT_IRQ_CONNECT)) { dev_warn(&dd->pdev->dev, @@ -1122,6 +871,12 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) dev_warn(&dd->pdev->dev, "Port stat errors %x unhandled\n", (port_stat & ~PORT_IRQ_HANDLED)); + if (mtip_check_surprise_removal(dd->pdev)) + return; + } + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { + set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); } } @@ -1222,7 +977,6 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance) static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) { - atomic_set(&port->commands[tag].active, 1); writel(1 << MTIP_TAG_BIT(tag), port->cmd_issue[MTIP_TAG_INDEX(tag)]); } @@ -1280,6 +1034,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) unsigned int n; unsigned int active = 1; + blk_mq_stop_hw_queues(port->dd->queue); + to = jiffies + msecs_to_jiffies(timeout); do { if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && @@ -1287,8 +1043,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) msleep(20); continue; /* svc thd is actively issuing commands */ } + + msleep(100); + if (mtip_check_surprise_removal(port->dd->pdev)) + goto err_fault; if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) - return -EFAULT; + goto err_fault; + /* * Ignore s_active bit 0 of array element 0. * This bit will always be set @@ -1299,11 +1060,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) if (!active) break; - - msleep(20); } while (time_before(jiffies, to)); + blk_mq_start_stopped_hw_queues(port->dd->queue, true); return active ? -EBUSY : 0; +err_fault: + blk_mq_start_stopped_hw_queues(port->dd->queue, true); + return -EFAULT; } /* @@ -1335,10 +1098,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, { struct mtip_cmd_sg *command_sg; DECLARE_COMPLETION_ONSTACK(wait); - int rv = 0, ready2go = 1; - struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; - unsigned long to; + struct mtip_cmd *int_cmd; struct driver_data *dd = port->dd; + int rv = 0; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1346,19 +1108,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - to = jiffies + msecs_to_jiffies(timeout); - do { - ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL, - port->allocated); - if (ready2go) - break; - mdelay(100); - } while (time_before(jiffies, to)); - if (!ready2go) { - dev_warn(&dd->pdev->dev, - "Internal cmd active. new cmd [%02X]\n", fis->command); - return -EBUSY; - } + int_cmd = mtip_get_int_command(dd); + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); port->ic_pause_timer = 0; @@ -1368,10 +1119,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { if (fis->command != ATA_CMD_STANDBYNOW1) { /* wait for io to complete if non atomic */ - if (mtip_quiesce_io(port, 5000) < 0) { + if (mtip_quiesce_io(port, + MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return -EBUSY; @@ -1416,9 +1168,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { /* Wait for the command to complete or timeout. */ - if (wait_for_completion_interruptible_timeout( + if ((rv = wait_for_completion_interruptible_timeout( &wait, - msecs_to_jiffies(timeout)) <= 0) { + msecs_to_jiffies(timeout))) <= 0) { if (rv == -ERESTARTSYS) { /* interrupted */ dev_err(&dd->pdev->dev, "Internal command [%02X] was interrupted after %lu ms\n", @@ -1497,8 +1249,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, } exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ - atomic_set(&int_cmd->active, 0); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ return rv; @@ -1529,6 +1280,37 @@ static inline void ata_swap_string(u16 *buf, unsigned int len) be16_to_cpus(&buf[i]); } +static void mtip_set_timeout(struct driver_data *dd, + struct host_to_dev_fis *fis, + unsigned int *timeout, u8 erasemode) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + if (erasemode) + *timeout = ((*(dd->port->identify + 90) * 2) * 60000); + else + *timeout = ((*(dd->port->identify + 89) * 2) * 60000); + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 120000; /* 2 minutes */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS; + break; + } +} + /* * Request the device identity information. * @@ -1576,7 +1358,7 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) sizeof(u16) * ATA_ID_WORDS, 0, GFP_KERNEL, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS) + MTIP_INT_CMD_TIMEOUT_MS) < 0) { rv = -1; goto out; @@ -1644,6 +1426,7 @@ static int mtip_standby_immediate(struct mtip_port *port) int rv; struct host_to_dev_fis fis; unsigned long start; + unsigned int timeout; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -1651,6 +1434,8 @@ static int mtip_standby_immediate(struct mtip_port *port) fis.opts = 1 << 7; fis.command = ATA_CMD_STANDBYNOW1; + mtip_set_timeout(port->dd, &fis, &timeout, 0); + start = jiffies; rv = mtip_exec_internal_command(port, &fis, @@ -1659,7 +1444,7 @@ static int mtip_standby_immediate(struct mtip_port *port) 0, 0, GFP_ATOMIC, - 15000); + timeout); dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", jiffies_to_msecs(jiffies - start)); if (rv) @@ -1705,7 +1490,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, sectors * ATA_SECT_SIZE, 0, GFP_ATOMIC, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS); + MTIP_INT_CMD_TIMEOUT_MS); } /* @@ -1998,6 +1783,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + unsigned int to; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -2011,6 +1797,8 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) fis.cyl_hi = command[5]; fis.device = command[6] & ~0x10; /* Clear the dev bit*/ + mtip_set_timeout(port->dd, &fis, &to, 0); + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", __func__, command[0], @@ -2029,7 +1817,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) 0, 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { + to) < 0) { return -1; } @@ -2069,6 +1857,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, u8 *buf = NULL; dma_addr_t dma_addr = 0; int rv = 0, xfer_sz = command[3]; + unsigned int to; if (xfer_sz) { if (!user_buffer) @@ -2100,6 +1889,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, fis.cyl_hi = 0xC2; } + mtip_set_timeout(port->dd, &fis, &to, 0); + if (xfer_sz) reply = (port->rxfis + RX_FIS_PIO_SETUP); else @@ -2122,7 +1913,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) + to) < 0) { rv = -EFAULT; goto exit_drive_command; @@ -2202,36 +1993,6 @@ static unsigned int implicit_sector(unsigned char command, } return rv; } -static void mtip_set_timeout(struct driver_data *dd, - struct host_to_dev_fis *fis, - unsigned int *timeout, u8 erasemode) -{ - switch (fis->command) { - case ATA_CMD_DOWNLOAD_MICRO: - *timeout = 120000; /* 2 minutes */ - break; - case ATA_CMD_SEC_ERASE_UNIT: - case 0xFC: - if (erasemode) - *timeout = ((*(dd->port->identify + 90) * 2) * 60000); - else - *timeout = ((*(dd->port->identify + 89) * 2) * 60000); - break; - case ATA_CMD_STANDBYNOW1: - *timeout = 120000; /* 2 minutes */ - break; - case 0xF7: - case 0xFA: - *timeout = 60000; /* 60 seconds */ - break; - case ATA_CMD_SMART: - *timeout = 15000; /* 15 seconds */ - break; - default: - *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; - break; - } -} /* * Executes a taskfile @@ -2606,22 +2367,21 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * return value * None */ -static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir, int unaligned) +static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, int nents, + struct blk_mq_hw_ctx *hctx) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; - struct mtip_cmd *command = &port->commands[tag]; - int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - u64 start = sector; + int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + u64 start = blk_rq_pos(rq); + unsigned int nsect = blk_rq_sectors(rq); /* Map the scatter list for DMA access */ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); command->scatter_ents = nents; - command->unaligned = unaligned; /* * The number of retries for this command before it is * reported as a failure to the upper layers. @@ -2632,8 +2392,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis = command->command; fis->type = 0x27; fis->opts = 1 << 7; - fis->command = - (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); + if (rq_data_dir(rq) == READ) + fis->command = ATA_CMD_FPDMA_READ; + else + fis->command = ATA_CMD_FPDMA_WRITE; fis->lba_low = start & 0xFF; fis->lba_mid = (start >> 8) & 0xFF; fis->lba_hi = (start >> 16) & 0xFF; @@ -2643,14 +2405,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis->device = 1 << 6; fis->features = nsect & 0xFF; fis->features_ex = (nsect >> 8) & 0xFF; - fis->sect_count = ((tag << 3) | (tag >> 5)); + fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5)); fis->sect_cnt_ex = 0; fis->control = 0; fis->res2 = 0; fis->res3 = 0; fill_command_sg(dd, command, nents); - if (unaligned) + if (command->unaligned) fis->device |= 1 << 7; /* Populate the command header */ @@ -2668,81 +2430,17 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->direction = dma_dir; /* - * Set the completion function and data for the command passed - * from the upper layer. - */ - command->async_data = data; - command->async_callback = callback; - - /* * To prevent this command from being issued * if an internal command is in progress or error handling is active. */ if (port->flags & MTIP_PF_PAUSE_IO) { - set_bit(tag, port->cmds_to_issue); + set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); return; } /* Issue the command to the hardware */ - mtip_issue_ncq_command(port, tag); - - return; -} - -/* - * Release a command slot. - * - * @dd Pointer to the driver data structure. - * @tag Slot tag - * - * return value - * None - */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, - int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - release_slot(dd->port, tag); - up(sem); -} - -/* - * Obtain a command slot and return its associated scatter list. - * - * @dd Pointer to the driver data structure. - * @tag Pointer to an int that will receive the allocated command - * slot tag. - * - * return value - * Pointer to the scatter list for the allocated command slot - * or NULL if no command slots are available. - */ -static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag, int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - - /* - * It is possible that, even with this semaphore, a thread - * may think that no command slots are available. Therefore, we - * need to make an attempt to get_slot(). - */ - down(sem); - *tag = get_slot(dd->port); - - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - up(sem); - return NULL; - } - if (unlikely(*tag < 0)) { - up(sem); - return NULL; - } - - return dd->port->commands[*tag].sg; + mtip_issue_ncq_command(port, rq->tag); } /* @@ -3113,6 +2811,7 @@ static int mtip_free_orphan(struct driver_data *dd) if (dd->queue) { dd->queue->queuedata = NULL; blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } } @@ -3270,6 +2969,11 @@ static int mtip_service_thread(void *data) int ret; while (1) { + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + /* * the condition is to check neither an internal command is * is in progress nor error handling is active @@ -3277,11 +2981,12 @@ static int mtip_service_thread(void *data) wait_event_interruptible(port->svc_wait, (port->flags) && !(port->flags & MTIP_PF_PAUSE_IO)); - if (kthread_should_stop()) - goto st_out; - set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + /* If I am an orphan, start self cleanup */ if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) break; @@ -3290,6 +2995,16 @@ static int mtip_service_thread(void *data) &dd->dd_flag))) goto st_out; +restart_eh: + /* Demux bits: start with error handling */ + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) { + mtip_handle_tfe(dd); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + } + + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) + goto restart_eh; + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ @@ -3319,16 +3034,14 @@ static int mtip_service_thread(void *data) } clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); - } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + } + + if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { if (mtip_ftl_rebuild_poll(dd) < 0) set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); } - clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); - - if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) - goto st_out; } /* wait for pci remove to exit */ @@ -3365,7 +3078,6 @@ st_out: */ static void mtip_dma_free(struct driver_data *dd) { - int i; struct mtip_port *port = dd->port; if (port->block1) @@ -3376,13 +3088,6 @@ static void mtip_dma_free(struct driver_data *dd) dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, port->command_list, port->command_list_dma); } - - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - if (port->commands[i].command) - dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - port->commands[i].command, - port->commands[i].command_dma); - } } /* @@ -3396,8 +3101,6 @@ static void mtip_dma_free(struct driver_data *dd) static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - int i, rv = 0; - u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ port->block1 = @@ -3430,41 +3133,63 @@ static int mtip_dma_alloc(struct driver_data *dd) port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; - /* Setup per command SGL DMA region */ - - /* Point the command headers at the command tables */ - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - port->commands[i].command = - dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - &port->commands[i].command_dma, GFP_KERNEL); - if (!port->commands[i].command) { - rv = -ENOMEM; - mtip_dma_free(dd); - return rv; - } - memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); - - port->commands[i].command_header = port->command_list + - (sizeof(struct mtip_cmd_hdr) * i); - port->commands[i].command_header_dma = - dd->port->command_list_dma + - (sizeof(struct mtip_cmd_hdr) * i); + return 0; +} - if (host_cap_64) - port->commands[i].command_header->ctbau = - __force_bit2int cpu_to_le32( - (port->commands[i].command_dma >> 16) >> 16); +static int mtip_hw_get_identify(struct driver_data *dd) +{ + struct smart_attr attr242; + unsigned char *buf; + int rv; - port->commands[i].command_header->ctba = - __force_bit2int cpu_to_le32( - port->commands[i].command_dma & 0xFFFFFFFF); + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; - sg_init_table(port->commands[i].sg, MTIP_MAX_SG); + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; + } + mtip_dump_identify(dd->port); - /* Mark command as currently inactive */ - atomic_set(&dd->port->commands[i].active, 0); + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } } - return 0; + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %u%% (%u blocks)\n", + attr242.cur, le32_to_cpu(attr242.data)); + + return rv; } /* @@ -3481,8 +3206,6 @@ static int mtip_hw_init(struct driver_data *dd) int rv; unsigned int num_command_slots; unsigned long timeout, timetaken; - unsigned char *buf; - struct smart_attr attr242; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3513,8 +3236,6 @@ static int mtip_hw_init(struct driver_data *dd) else dd->unal_qdepth = 0; - /* Counting semaphore to track command slot usage */ - sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ @@ -3599,73 +3320,16 @@ static int mtip_hw_init(struct driver_data *dd) writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, dd->mmio + HOST_CTL); - init_timer(&dd->port->cmd_timer); init_waitqueue_head(&dd->port->svc_wait); - dd->port->cmd_timer.data = (unsigned long int) dd->port; - dd->port->cmd_timer.function = mtip_timeout_function; - mod_timer(&dd->port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); - - if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { rv = -EFAULT; goto out3; } - if (mtip_get_identify(dd->port, NULL) < 0) { - rv = -EFAULT; - goto out3; - } - mtip_dump_identify(dd->port); - - if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == - MTIP_FTL_REBUILD_MAGIC) { - set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); - return MTIP_FTL_REBUILD_MAGIC; - } - - /* check write protect, over temp and rebuild statuses */ - rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, - dd->port->log_buf, - dd->port->log_buf_dma, 1); - if (rv) { - dev_warn(&dd->pdev->dev, - "Error in READ LOG EXT (10h) command\n"); - /* non-critical error, don't fail the load */ - } else { - buf = (unsigned char *)dd->port->log_buf; - if (buf[259] & 0x1) { - dev_info(&dd->pdev->dev, - "Write protect bit is set.\n"); - set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); - } - if (buf[288] == 0xF7) { - dev_info(&dd->pdev->dev, - "Exceeded Tmax, drive in thermal shutdown.\n"); - set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); - } - if (buf[288] == 0xBF) { - dev_info(&dd->pdev->dev, - "Drive is in security locked state.\n"); - set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); - } - } - - /* get write protect progess */ - memset(&attr242, 0, sizeof(struct smart_attr)); - if (mtip_get_smart_attr(dd->port, 242, &attr242)) - dev_warn(&dd->pdev->dev, - "Unable to check write protect progress\n"); - else - dev_info(&dd->pdev->dev, - "Write protect progress: %u%% (%u blocks)\n", - attr242.cur, le32_to_cpu(attr242.data)); return rv; out3: - del_timer_sync(&dd->port->cmd_timer); - /* Disable interrupts on the HBA. */ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, dd->mmio + HOST_CTL); @@ -3685,6 +3349,22 @@ out1: return rv; } +static void mtip_standby_drive(struct driver_data *dd) +{ + if (dd->sr) + return; + + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); +} + /* * Called to deinitialize an interface. * @@ -3700,12 +3380,6 @@ static int mtip_hw_exit(struct driver_data *dd) * saves its state. */ if (!dd->sr) { - if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && - !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) - if (mtip_standby_immediate(dd->port)) - dev_warn(&dd->pdev->dev, - "STANDBY IMMEDIATE failed\n"); - /* de-initialize the port. */ mtip_deinit_port(dd->port); @@ -3714,8 +3388,6 @@ static int mtip_hw_exit(struct driver_data *dd) dd->mmio + HOST_CTL); } - del_timer_sync(&dd->port->cmd_timer); - /* Release the IRQ. */ irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); @@ -4032,100 +3704,138 @@ static const struct block_device_operations mtip_block_ops = { * * @queue Pointer to the request queue. Unused other than to obtain * the driver data structure. - * @bio Pointer to the BIO. + * @rq Pointer to the request. * */ -static void mtip_make_request(struct request_queue *queue, struct bio *bio) +static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct driver_data *dd = queue->queuedata; - struct scatterlist *sg; - struct bio_vec bvec; - struct bvec_iter iter; - int nents = 0; - int tag = 0, unaligned = 0; + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + unsigned int nents; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENXIO); - return; + return -ENXIO; } if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; + return -ENODATA; } if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) && - bio_data_dir(bio))) { - bio_endio(bio, -ENODATA); - return; - } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; - } - if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { - bio_endio(bio, -ENXIO); - return; + rq_data_dir(rq))) { + return -ENODATA; } + if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) + return -ENODATA; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return -ENXIO; } - if (unlikely(bio->bi_rw & REQ_DISCARD)) { - bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector, - bio_sectors(bio))); - return; - } + if (rq->cmd_flags & REQ_DISCARD) { + int err; - if (unlikely(!bio_has_data(bio))) { - blk_queue_flush(queue, 0); - bio_endio(bio, 0); - return; + err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + blk_mq_end_io(rq, err); + return 0; } - if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && - dd->unal_qdepth) { - if (bio->bi_iter.bi_sector % 8 != 0) - /* Unaligned on 4k boundaries */ - unaligned = 1; - else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ - unaligned = 1; + /* Create the scatter list for this request. */ + nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, rq, cmd, nents, hctx); + return 0; +} + +static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!dd->unal_qdepth || rq_data_dir(rq) == READ) + return false; + + /* + * If unaligned depth must be limited on this controller, mark it + * as unaligned if the IO isn't on a 4k boundary (start of length). + */ + if (blk_rq_sectors(rq) <= 64) { + if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7)) + cmd->unaligned = 1; } - sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); - if (likely(sg != NULL)) { - blk_queue_bounce(queue, &bio); + if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + return true; - if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { - dev_warn(&dd->pdev->dev, - "Maximum number of SGL entries exceeded\n"); - bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag, unaligned); - return; - } + return false; +} - /* Create the scatter list for this bio. */ - bio_for_each_segment(bvec, bio, iter) { - sg_set_page(&sg[nents], - bvec.bv_page, - bvec.bv_len, - bvec.bv_offset); - nents++; - } +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + int ret; - /* Issue the read/write. */ - mtip_hw_submit_io(dd, - bio->bi_iter.bi_sector, - bio_sectors(bio), - nents, - tag, - bio_endio, - bio, - bio_data_dir(bio), - unaligned); - } else - bio_io_error(bio); + if (mtip_check_unal_depth(hctx, rq)) + return BLK_MQ_RQ_QUEUE_BUSY; + + ret = mtip_submit_request(hctx, rq); + if (!ret) + return BLK_MQ_RQ_QUEUE_OK; + + rq->errors = ret; + return BLK_MQ_RQ_QUEUE_ERROR; +} + +static void mtip_free_cmd(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!cmd->command) + return; + + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + cmd->command, cmd->command_dma); +} + +static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, + unsigned int request_idx, unsigned int numa_node) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + &cmd->command_dma, GFP_KERNEL); + if (!cmd->command) + return -ENOMEM; + + memset(cmd->command, 0, CMD_DMA_ALLOC_SZ); + + /* Point the command headers at the command tables. */ + cmd->command_header = dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * request_idx); + cmd->command_header_dma = dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * request_idx); + + if (host_cap_64) + cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); + + cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + + sg_init_table(cmd->sg, MTIP_MAX_SG); + return 0; } +static struct blk_mq_ops mtip_mq_ops = { + .queue_rq = mtip_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = mtip_init_cmd, + .exit_request = mtip_free_cmd, +}; + /* * Block layer initialization function. * @@ -4148,11 +3858,7 @@ static int mtip_block_initialize(struct driver_data *dd) if (dd->disk) goto skip_create_disk; /* hw init done, before rebuild */ - /* Initialize the protocol layer. */ - wait_for_rebuild = mtip_hw_init(dd); - if (wait_for_rebuild < 0) { - dev_err(&dd->pdev->dev, - "Protocol layer initialization failed\n"); + if (mtip_hw_init(dd)) { rv = -EINVAL; goto protocol_init_error; } @@ -4194,29 +3900,53 @@ static int mtip_block_initialize(struct driver_data *dd) mtip_hw_debugfs_init(dd); - /* - * if rebuild pending, start the service thread, and delay the block - * queue creation and add_disk() - */ - if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) - goto start_service_thread; - skip_create_disk: - /* Allocate the request queue. */ - dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); - if (dd->queue == NULL) { + memset(&dd->tags, 0, sizeof(dd->tags)); + dd->tags.ops = &mtip_mq_ops; + dd->tags.nr_hw_queues = 1; + dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS; + dd->tags.reserved_tags = 1; + dd->tags.cmd_size = sizeof(struct mtip_cmd); + dd->tags.numa_node = dd->numa_node; + dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; + dd->tags.driver_data = dd; + + rv = blk_mq_alloc_tag_set(&dd->tags); + if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); rv = -ENOMEM; goto block_queue_alloc_init_error; } - /* Attach our request function to the request queue. */ - blk_queue_make_request(dd->queue, mtip_make_request); + /* Allocate the request queue. */ + dd->queue = blk_mq_init_queue(&dd->tags); + if (IS_ERR(dd->queue)) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } dd->disk->queue = dd->queue; dd->queue->queuedata = dd; + /* Initialize the protocol layer. */ + wait_for_rebuild = mtip_hw_get_identify(dd); + if (wait_for_rebuild < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto init_hw_cmds_error; + } + + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + /* Set device limits. */ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); @@ -4295,8 +4025,9 @@ kthread_run_error: del_gendisk(dd->disk); read_capacity_error: +init_hw_cmds_error: blk_cleanup_queue(dd->queue); - + blk_mq_free_tag_set(&dd->tags); block_queue_alloc_init_error: mtip_hw_debugfs_exit(dd); disk_index_error: @@ -4345,6 +4076,9 @@ static int mtip_block_remove(struct driver_data *dd) kobject_put(kobj); } } + + mtip_standby_drive(dd); + /* * Delete our gendisk structure. This also removes the device * from /dev @@ -4357,6 +4091,7 @@ static int mtip_block_remove(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } else put_disk(dd->disk); @@ -4391,6 +4126,8 @@ static int mtip_block_remove(struct driver_data *dd) */ static int mtip_block_shutdown(struct driver_data *dd) { + mtip_hw_shutdown(dd); + /* Delete our gendisk structure, and cleanup the blk queue. */ if (dd->disk) { dev_info(&dd->pdev->dev, @@ -4399,6 +4136,7 @@ static int mtip_block_shutdown(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); } else put_disk(dd->disk); dd->disk = NULL; @@ -4408,8 +4146,6 @@ static int mtip_block_shutdown(struct driver_data *dd) spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); spin_unlock(&rssd_index_lock); - - mtip_hw_shutdown(dd); return 0; } @@ -4479,6 +4215,57 @@ static DEFINE_HANDLER(5); static DEFINE_HANDLER(6); static DEFINE_HANDLER(7); +static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev) +{ + int pos; + unsigned short pcie_dev_ctrl; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pos) { + pci_read_config_word(pdev, + pos + PCI_EXP_DEVCTL, + &pcie_dev_ctrl); + if (pcie_dev_ctrl & (1 << 11) || + pcie_dev_ctrl & (1 << 4)) { + dev_info(&dd->pdev->dev, + "Disabling ERO/No-Snoop on bridge device %04x:%04x\n", + pdev->vendor, pdev->device); + pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN | + PCI_EXP_DEVCTL_RELAX_EN); + pci_write_config_word(pdev, + pos + PCI_EXP_DEVCTL, + pcie_dev_ctrl); + } + } +} + +static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev) +{ + /* + * This workaround is specific to AMD/ATI chipset with a PCI upstream + * device with device id 0x5aXX + */ + if (pdev->bus && pdev->bus->self) { + if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI && + ((pdev->bus->self->device & 0xff00) == 0x5a00)) { + mtip_disable_link_opts(dd, pdev->bus->self); + } else { + /* Check further up the topology */ + struct pci_dev *parent_dev = pdev->bus->self; + if (parent_dev->bus && + parent_dev->bus->parent && + parent_dev->bus->parent->self && + parent_dev->bus->parent->self->vendor == + PCI_VENDOR_ID_ATI && + (parent_dev->bus->parent->self->device & + 0xff00) == 0x5a00) { + mtip_disable_link_opts(dd, + parent_dev->bus->parent->self); + } + } + } +} + /* * Called for each supported PCI device detected. * @@ -4630,6 +4417,8 @@ static int mtip_pci_probe(struct pci_dev *pdev, goto msi_initialize_err; } + mtip_fix_ero_nosnoop(dd, pdev); + /* Initialize the block layer. */ rv = mtip_block_initialize(dd); if (rv < 0) { @@ -4710,8 +4499,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); } - /* Cleanup the outstanding commands */ - mtip_command_cleanup(dd); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -4737,8 +4524,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); - pci_dev_put(pdev); - } /* @@ -4935,13 +4720,13 @@ static int __init mtip_init(void) */ static void __exit mtip_exit(void) { - debugfs_remove_recursive(dfs_parent); - /* Release the allocated major block device number. */ unregister_blkdev(mtip_major, MTIP_DRV_NAME); /* Unregister the PCI driver. */ pci_unregister_driver(&mtip_pci_driver); + + debugfs_remove_recursive(dfs_parent); } MODULE_AUTHOR("Micron Technology, Inc"); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index ffb955e7ccb9..4b9b554234bc 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -40,9 +40,11 @@ #define MTIP_MAX_RETRIES 2 /* Various timeout values in ms */ -#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 -#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 -#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 +#define MTIP_NCQ_CMD_TIMEOUT_MS 15000 +#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000 +#define MTIP_INT_CMD_TIMEOUT_MS 5000 +#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \ + (MTIP_MAX_RETRIES + 1)) /* check for timeouts every 500ms */ #define MTIP_TIMEOUT_CHECK_PERIOD 500 @@ -331,12 +333,8 @@ struct mtip_cmd { */ void (*comp_func)(struct mtip_port *port, int tag, - void *data, + struct mtip_cmd *cmd, int status); - /* Additional callback function that may be called by comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. data passed to async_callback() */ int scatter_ents; /* Number of scatter list entries used */ @@ -347,10 +345,6 @@ struct mtip_cmd { int retries; /* The number of retries left for this command. */ int direction; /* Data transfer direction */ - - unsigned long comp_time; /* command completion time, in jiffies */ - - atomic_t active; /* declares if this command sent to the drive. */ }; /* Structure used to describe a port. */ @@ -436,12 +430,6 @@ struct mtip_port { * or error handling is active */ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; - /* - * Array of command slots. Structure includes pointers to the - * command header and command table, and completion function and data - * pointers. - */ - struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; /* Used by mtip_service_thread to wait for an event */ wait_queue_head_t svc_wait; /* @@ -452,13 +440,7 @@ struct mtip_port { /* * Timer used to complete commands that have been active for too long. */ - struct timer_list cmd_timer; unsigned long ic_pause_timer; - /* - * Semaphore used to block threads if there are no - * command slots available. - */ - struct semaphore cmd_slot; /* Semaphore to control queue depth of unaligned IOs */ struct semaphore cmd_slot_unal; @@ -485,6 +467,8 @@ struct driver_data { struct request_queue *queue; /* Our request queue. */ + struct blk_mq_tag_set tags; /* blk_mq tags */ + struct mtip_port *port; /* Pointer to the port data structure. */ unsigned product_type; /* magic value declaring the product type */ diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index b40af63a5476..77087a29b127 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -203,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) entry = llist_reverse_order(entry); do { cmd = container_of(entry, struct nullb_cmd, ll_list); - end_cmd(cmd); entry = entry->next; + end_cmd(cmd); } while (entry); } diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index c48d9084c965..608532d3f8c9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -3944,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev) for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) entries[i].entry = i; - rc = pci_enable_msix_range(pdev, entries, - SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT); - if (rc < 0) { + rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT); + if (rc) { pr_err("(%s): failed to enable MSI-X %d\n", skd_name(skdev), rc); goto msix_out; } - skdev->msix_count = rc; + skdev->msix_count = SKD_MAX_MSIX_COUNT; skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * skdev->msix_count, GFP_KERNEL); if (!skdev->msix_entries) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 16c21c0cb14d..c8f286e8d80f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -147,11 +147,11 @@ static void virtblk_done(struct virtqueue *vq) if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - spin_unlock_irqrestore(&vblk->vq_lock, flags); /* In case queue is stopped waiting for more buffers. */ if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + spin_unlock_irqrestore(&vblk->vq_lock, flags); } static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) @@ -205,8 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); if (err) { virtqueue_kick(vblk->vq); - spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&vblk->vq_lock, flags); /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) |