diff options
50 files changed, 9366 insertions, 3379 deletions
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt index 470fe4b5e379..e2240f5ab64d 100644 --- a/Documentation/blockdev/floppy.txt +++ b/Documentation/blockdev/floppy.txt @@ -39,15 +39,15 @@ Module configuration options ============================ If you use the floppy driver as a module, use the following syntax: -modprobe floppy <options> +modprobe floppy floppy="<options>" Example: - modprobe floppy omnibook messages + modprobe floppy floppy="omnibook messages" If you need certain options enabled every time you load the floppy driver, you can put: - options floppy omnibook messages + options floppy floppy="omnibook messages" in a configuration file in /etc/modprobe.d/. diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4682546c5da7..1b84778e9bbd 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -110,7 +110,7 @@ source "drivers/block/mtip32xx/Kconfig" config BLK_CPQ_DA tristate "Compaq SMART2 support" - depends on PCI && VIRT_TO_BUS + depends on PCI && VIRT_TO_BUS && 0 help This is the driver for Compaq Smart Array controllers. Everyone using these boards should say Y here. See the file @@ -319,6 +319,16 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. +config BLK_DEV_SKD + tristate "STEC S1120 Block Driver" + depends on PCI + depends on 64BIT + ---help--- + Saying Y or M here will enable support for the + STEC, Inc. S1120 PCIe SSD. + + Use device /dev/skd$N amd /dev/skd$Np$M. + config BLK_DEV_OSD tristate "OSD object-as-blkdev support" depends on SCSI_OSD_ULD diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 03b3b4a2bd8a..8cc98cd0d4a8 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_BLK_DEV_SKD) += skd.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o @@ -44,4 +45,5 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o nvme-y := nvme-core.o nvme-scsi.o +skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index edfa2515bc86..0c004ac05811 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -5183,7 +5183,7 @@ reinit_after_soft_reset: rebuild_lun_table(h, 1, 0); cciss_engage_scsi(h); h->busy_initializing = 0; - return 1; + return 0; clean4: cciss_free_cmd_pool(h); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2d7f608d181c..0e06f0c5dd1e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1474,7 +1474,8 @@ enum determine_dev_size { DS_ERROR = -1, DS_UNCHANGED = 0, DS_SHRUNK = 1, - DS_GREW = 2 + DS_GREW = 2, + DS_GREW_FROM_ZERO = 3, }; extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 55635edf563b..9e3818b1bc83 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2750,13 +2750,6 @@ int __init drbd_init(void) return err; } - err = drbd_genl_register(); - if (err) { - printk(KERN_ERR "drbd: unable to register generic netlink family\n"); - goto fail; - } - - register_reboot_notifier(&drbd_notifier); /* @@ -2767,6 +2760,15 @@ int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&minors); + rwlock_init(&global_state_lock); + INIT_LIST_HEAD(&drbd_tconns); + + err = drbd_genl_register(); + if (err) { + printk(KERN_ERR "drbd: unable to register generic netlink family\n"); + goto fail; + } + err = drbd_create_mempools(); if (err) goto fail; @@ -2778,9 +2780,6 @@ int __init drbd_init(void) goto fail; } - rwlock_init(&global_state_lock); - INIT_LIST_HEAD(&drbd_tconns); - retry.wq = create_singlethread_workqueue("drbd-reissue"); if (!retry.wq) { printk(KERN_ERR "drbd: unable to create retry workqueue\n"); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 8cc1e640f485..c706d50a8b06 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -955,7 +955,7 @@ drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct res } if (size > la_size_sect) - rv = DS_GREW; + rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; if (size < la_size_sect) rv = DS_SHRUNK; @@ -1132,9 +1132,9 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) /* We may ignore peer limits if the peer is modern enough. Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ - if (mdev->state.conn >= C_CONNECTED) { + if (mdev->state.conn >= C_WF_REPORT_PARAMS) { if (mdev->tconn->agreed_pro_version < 94) - peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ else if (mdev->tconn->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index cc29cd3bf78b..6fa6673b36b3 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1890,29 +1890,11 @@ static u32 seq_max(u32 a, u32 b) return seq_greater(a, b) ? a : b; } -static bool need_peer_seq(struct drbd_conf *mdev) -{ - struct drbd_tconn *tconn = mdev->tconn; - int tp; - - /* - * We only need to keep track of the last packet_seq number of our peer - * if we are in dual-primary mode and we have the resolve-conflicts flag set; see - * handle_write_conflicts(). - */ - - rcu_read_lock(); - tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; - rcu_read_unlock(); - - return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); -} - static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) { unsigned int newest_peer_seq; - if (need_peer_seq(mdev)) { + if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) { spin_lock(&mdev->peer_seq_lock); newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); mdev->peer_seq = newest_peer_seq; @@ -1972,22 +1954,31 @@ static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_s { DEFINE_WAIT(wait); long timeout; - int ret; + int ret = 0, tp; - if (!need_peer_seq(mdev)) + if (!test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) return 0; spin_lock(&mdev->peer_seq_lock); for (;;) { if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); - ret = 0; break; } + if (signal_pending(current)) { ret = -ERESTARTSYS; break; } + + rcu_read_lock(); + tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; + rcu_read_unlock(); + + if (!tp) + break; + + /* Only need to wait if two_primaries is enabled */ prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); spin_unlock(&mdev->peer_seq_lock); rcu_read_lock(); @@ -2228,8 +2219,10 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) } goto out_interrupted; } - } else + } else { + update_peer_seq(mdev, peer_seq); spin_lock_irq(&mdev->tconn->req_lock); + } list_add(&peer_req->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->tconn->req_lock); @@ -4132,7 +4125,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev, (unsigned int)bs.buf_len); return -EIO; } - look_ahead >>= bits; + /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ + if (likely(bits < 64)) + look_ahead >>= bits; + else + look_ahead = 0; have -= bits; bits = bitstream_get_bits(&bs, &tmp, 64 - have); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index c24379ffd4e3..fec7bef44994 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1306,6 +1306,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct int backing_limit; if (bio_size && get_ldev(mdev)) { + unsigned int max_hw_sectors = queue_max_hw_sectors(q); struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; if (b->merge_bvec_fn) { @@ -1313,6 +1314,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct limit = min(limit, backing_limit); } put_ldev(mdev); + if ((limit >> 9) > max_hw_sectors) + limit = max_hw_sectors << 9; } return limit; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index dbdb88a4976c..c8dac7305244 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -894,13 +894,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bio_list_init(&lo->lo_bio_list); - /* - * set queue make_request_fn, and add limits based on lower level - * device - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); - lo->lo_queue->queuedata = lo; - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -1618,6 +1611,8 @@ static int loop_add(struct loop_device **l, int i) if (!lo) goto out; + lo->lo_state = Lo_unbound; + /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); @@ -1635,6 +1630,12 @@ static int loop_add(struct loop_device **l, int i) if (!lo->lo_queue) goto out_free_idr; + /* + * set queue make_request_fn + */ + blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue->queuedata = lo; + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 77a60bedd7a3..7bc363f1ee82 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -936,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev) goto probe_err_3b; } err = request_irq(host->irq, mg_irq, - IRQF_DISABLED | IRQF_TRIGGER_RISING, + IRQF_TRIGGER_RISING, MG_DEV_NAME, host); if (err) { printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 952dbfe22126..050c71267f14 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -126,64 +126,30 @@ struct mtip_compat_ide_task_request_s { static bool mtip_check_surprise_removal(struct pci_dev *pdev) { u16 vendor_id = 0; + struct driver_data *dd = pci_get_drvdata(pdev); + + if (dd->sr) + return true; /* Read the vendorID from the configuration space */ pci_read_config_word(pdev, 0x00, &vendor_id); - if (vendor_id == 0xFFFF) + if (vendor_id == 0xFFFF) { + dd->sr = true; + if (dd->queue) + set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags); + else + dev_warn(&dd->pdev->dev, + "%s: dd->queue is NULL\n", __func__); + if (dd->port) { + set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + } else + dev_warn(&dd->pdev->dev, + "%s: dd->port is NULL\n", __func__); return true; /* device removed */ - - return false; /* device present */ -} - -/* - * This function is called for clean the pending command in the - * command slot during the surprise removal of device and return - * error to the upper layer. - * - * @dd Pointer to the DRIVER_DATA structure. - * - * return value - * None - */ -static void mtip_command_cleanup(struct driver_data *dd) -{ - int group = 0, commandslot = 0, commandindex = 0; - struct mtip_cmd *command; - struct mtip_port *port = dd->port; - static int in_progress; - - if (in_progress) - return; - - in_progress = 1; - - for (group = 0; group < 4; group++) { - for (commandslot = 0; commandslot < 32; commandslot++) { - if (!(port->allocated[group] & (1 << commandslot))) - continue; - - commandindex = group << 5 | commandslot; - command = &port->commands[commandindex]; - - if (atomic_read(&command->active) - && (command->async_callback)) { - command->async_callback(command->async_data, - -ENODEV); - command->async_callback = NULL; - command->async_data = NULL; - } - - dma_unmap_sg(&port->dd->pdev->dev, - command->sg, - command->scatter_ents, - command->direction); - } } - up(&port->cmd_slot); - - set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); - in_progress = 0; + return false; /* device present */ } /* @@ -222,10 +188,7 @@ static int get_slot(struct mtip_port *port) } dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); - if (mtip_check_surprise_removal(port->dd->pdev)) { - /* Device not present, clean outstanding commands */ - mtip_command_cleanup(port->dd); - } + mtip_check_surprise_removal(port->dd->pdev); return -1; } @@ -246,6 +209,107 @@ static inline void release_slot(struct mtip_port *port, int tag) } /* + * IO completion function. + * + * This completion function is called by the driver ISR when a + * command that was issued by the kernel completes. It first calls the + * asynchronous completion function which normally calls back into the block + * layer passing the asynchronous callback data, then unmaps the + * scatter list associated with the completed command, and finally + * clears the allocated bit associated with the completed command. + * + * @port Pointer to the port data structure. + * @tag Tag of the command. + * @data Pointer to driver_data. + * @status Completion status. + * + * return value + * None + */ +static void mtip_async_complete(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command; + struct driver_data *dd = data; + int cb_status = status ? -EIO : 0; + + if (unlikely(!dd) || unlikely(!port)) + return; + + command = &port->commands[tag]; + + if (unlikely(status == PORT_IRQ_TF_ERR)) { + dev_warn(&port->dd->pdev->dev, + "Command tag %d failed due to TFE\n", tag); + } + + /* Upper layer callback */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, cb_status); + + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* Clear the allocated and active bits for the command */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); +} + +/* + * This function is called for clean the pending command in the + * command slot during the surprise removal of device and return + * error to the upper layer. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_command_cleanup(struct driver_data *dd) +{ + int tag = 0; + struct mtip_cmd *cmd; + struct mtip_port *port = dd->port; + unsigned int num_cmd_slots = dd->slot_groups * 32; + + if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + return; + + if (!port) + return; + + cmd = &port->commands[MTIP_TAG_INTERNAL]; + if (atomic_read(&cmd->active)) + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & + (1 << MTIP_TAG_INTERNAL)) + if (cmd->comp_func) + cmd->comp_func(port, MTIP_TAG_INTERNAL, + cmd->comp_data, -ENODEV); + + while (1) { + tag = find_next_bit(port->allocated, num_cmd_slots, tag); + if (tag >= num_cmd_slots) + break; + + cmd = &port->commands[tag]; + if (atomic_read(&cmd->active)) + mtip_async_complete(port, tag, dd, -ENODEV); + } + + set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); +} + +/* * Reset the HBA (without sleeping) * * @dd Pointer to the driver data structure. @@ -584,6 +648,9 @@ static void mtip_timeout_function(unsigned long int data) if (unlikely(!port)) return; + if (unlikely(port->dd->sr)) + return; + if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { mod_timer(&port->cmd_timer, jiffies + msecs_to_jiffies(30000)); @@ -675,66 +742,6 @@ static void mtip_timeout_function(unsigned long int data) } /* - * IO completion function. - * - * This completion function is called by the driver ISR when a - * command that was issued by the kernel completes. It first calls the - * asynchronous completion function which normally calls back into the block - * layer passing the asynchronous callback data, then unmaps the - * scatter list associated with the completed command, and finally - * clears the allocated bit associated with the completed command. - * - * @port Pointer to the port data structure. - * @tag Tag of the command. - * @data Pointer to driver_data. - * @status Completion status. - * - * return value - * None - */ -static void mtip_async_complete(struct mtip_port *port, - int tag, - void *data, - int status) -{ - struct mtip_cmd *command; - struct driver_data *dd = data; - int cb_status = status ? -EIO : 0; - - if (unlikely(!dd) || unlikely(!port)) - return; - - command = &port->commands[tag]; - - if (unlikely(status == PORT_IRQ_TF_ERR)) { - dev_warn(&port->dd->pdev->dev, - "Command tag %d failed due to TFE\n", tag); - } - - /* Upper layer callback */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, cb_status); - - command->async_callback = NULL; - command->comp_func = NULL; - - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, - command->sg, - command->scatter_ents, - command->direction); - - /* Clear the allocated and active bits for the command */ - atomic_set(&port->commands[tag].active, 0); - release_slot(port, tag); - - if (unlikely(command->unaligned)) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); -} - -/* * Internal command completion callback function. * * This function is normally called by the driver ISR when an internal @@ -854,7 +861,6 @@ static void mtip_handle_tfe(struct driver_data *dd) "Missing completion func for tag %d", tag); if (mtip_check_surprise_removal(dd->pdev)) { - mtip_command_cleanup(dd); /* don't proceed further */ return; } @@ -1018,14 +1024,12 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, command->comp_data, 0); } else { - dev_warn(&dd->pdev->dev, - "Null completion " - "for tag %d", + dev_dbg(&dd->pdev->dev, + "Null completion for tag %d", tag); if (mtip_check_surprise_removal( dd->pdev)) { - mtip_command_cleanup(dd); return; } } @@ -1145,7 +1149,6 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) if (unlikely(port_stat & PORT_IRQ_ERR)) { if (unlikely(mtip_check_surprise_removal(dd->pdev))) { - mtip_command_cleanup(dd); /* don't proceed further */ return IRQ_HANDLED; } @@ -2806,34 +2809,51 @@ static ssize_t show_device_status(struct device_driver *drv, char *buf) static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { + struct driver_data *dd = (struct driver_data *)f->private_data; int size = *offset; - char buf[MTIP_DFS_MAX_BUF_SIZE]; + char *buf; + int rv = 0; if (!len || *offset) return 0; + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: status buffer\n"); + return -ENOMEM; + } + size += show_device_status(NULL, buf); *offset = size <= len ? size : len; size = copy_to_user(ubuf, buf, *offset); if (size) - return -EFAULT; + rv = -EFAULT; - return *offset; + kfree(buf); + return rv ? rv : *offset; } static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { struct driver_data *dd = (struct driver_data *)f->private_data; - char buf[MTIP_DFS_MAX_BUF_SIZE]; + char *buf; u32 group_allocated; int size = *offset; - int n; + int n, rv = 0; if (!len || size) return 0; + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: register buffer\n"); + return -ENOMEM; + } + size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); for (n = dd->slot_groups-1; n >= 0; n--) @@ -2888,21 +2908,30 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, *offset = size <= len ? size : len; size = copy_to_user(ubuf, buf, *offset); if (size) - return -EFAULT; + rv = -EFAULT; - return *offset; + kfree(buf); + return rv ? rv : *offset; } static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { struct driver_data *dd = (struct driver_data *)f->private_data; - char buf[MTIP_DFS_MAX_BUF_SIZE]; + char *buf; int size = *offset; + int rv = 0; if (!len || size) return 0; + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: flag buffer\n"); + return -ENOMEM; + } + size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", dd->port->flags); size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", @@ -2911,9 +2940,10 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, *offset = size <= len ? size : len; size = copy_to_user(ubuf, buf, *offset); if (size) - return -EFAULT; + rv = -EFAULT; - return *offset; + kfree(buf); + return rv ? rv : *offset; } static const struct file_operations mtip_device_status_fops = { @@ -3006,6 +3036,46 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd) debugfs_remove_recursive(dd->dfs_node); } +static int mtip_free_orphan(struct driver_data *dd) +{ + struct kobject *kobj; + + if (dd->bdev) { + if (dd->bdev->bd_holders >= 1) + return -2; + + bdput(dd->bdev); + dd->bdev = NULL; + } + + mtip_hw_debugfs_exit(dd); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + + if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) && + test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { + put_disk(dd->disk); + } else { + if (dd->disk) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + del_gendisk(dd->disk); + dd->disk = NULL; + } + if (dd->queue) { + dd->queue->queuedata = NULL; + blk_cleanup_queue(dd->queue); + dd->queue = NULL; + } + } + kfree(dd); + return 0; +} /* * Perform any init/resume time hardware setup @@ -3154,6 +3224,7 @@ static int mtip_service_thread(void *data) unsigned long slot, slot_start, slot_wrap; unsigned int num_cmd_slots = dd->slot_groups * 32; struct mtip_port *port = dd->port; + int ret; while (1) { /* @@ -3164,13 +3235,18 @@ static int mtip_service_thread(void *data) !(port->flags & MTIP_PF_PAUSE_IO)); if (kthread_should_stop()) + goto st_out; + + set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + + /* If I am an orphan, start self cleanup */ + if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) break; if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) - break; + goto st_out; - set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ @@ -3201,7 +3277,7 @@ static int mtip_service_thread(void *data) clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { - if (!mtip_ftl_rebuild_poll(dd)) + if (mtip_ftl_rebuild_poll(dd) < 0) set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); @@ -3209,8 +3285,30 @@ static int mtip_service_thread(void *data) clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + } + + /* wait for pci remove to exit */ + while (1) { + if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag)) break; + msleep_interruptible(1000); + if (kthread_should_stop()) + goto st_out; + } + + while (1) { + ret = mtip_free_orphan(dd); + if (!ret) { + /* NOTE: All data structures are invalid, do not + * access any here */ + return 0; + } + msleep_interruptible(1000); + if (kthread_should_stop()) + goto st_out; } +st_out: return 0; } @@ -3437,13 +3535,13 @@ static int mtip_hw_init(struct driver_data *dd) rv = -EFAULT; goto out3; } + mtip_dump_identify(dd->port); if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == MTIP_FTL_REBUILD_MAGIC) { set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); return MTIP_FTL_REBUILD_MAGIC; } - mtip_dump_identify(dd->port); /* check write protect, over temp and rebuild statuses */ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, @@ -3467,8 +3565,8 @@ static int mtip_hw_init(struct driver_data *dd) } if (buf[288] == 0xBF) { dev_info(&dd->pdev->dev, - "Drive indicates rebuild has failed.\n"); - /* TODO */ + "Drive is in security locked state.\n"); + set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); } } @@ -3523,9 +3621,8 @@ static int mtip_hw_exit(struct driver_data *dd) * Send standby immediate (E0h) to the drive so that it * saves its state. */ - if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { - - if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) + if (!dd->sr) { + if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) if (mtip_standby_immediate(dd->port)) dev_warn(&dd->pdev->dev, "STANDBY IMMEDIATE failed\n"); @@ -3551,6 +3648,7 @@ static int mtip_hw_exit(struct driver_data *dd) dd->port->command_list_dma); /* Free the memory allocated for the for structure. */ kfree(dd->port); + dd->port = NULL; return 0; } @@ -3572,7 +3670,8 @@ static int mtip_hw_shutdown(struct driver_data *dd) * Send standby immediate (E0h) to the drive so that it * saves its state. */ - mtip_standby_immediate(dd->port); + if (!dd->sr && dd->port) + mtip_standby_immediate(dd->port); return 0; } @@ -3887,6 +3986,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) bio_endio(bio, -ENODATA); return; } + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { + bio_endio(bio, -ENXIO); + return; + } } if (unlikely(bio->bi_rw & REQ_DISCARD)) { @@ -4010,6 +4113,8 @@ static int mtip_block_initialize(struct driver_data *dd) dd->disk->private_data = dd; dd->index = index; + mtip_hw_debugfs_init(dd); + /* * if rebuild pending, start the service thread, and delay the block * queue creation and add_disk() @@ -4068,6 +4173,7 @@ skip_create_disk: /* Enable the block device and add it to /dev */ add_disk(dd->disk); + dd->bdev = bdget_disk(dd->disk, 0); /* * Now that the disk is active, initialize any sysfs attributes * managed by the protocol layer. @@ -4077,7 +4183,6 @@ skip_create_disk: mtip_hw_sysfs_init(dd, kobj); kobject_put(kobj); } - mtip_hw_debugfs_init(dd); if (dd->mtip_svc_handler) { set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); @@ -4103,7 +4208,8 @@ start_service_thread: return rv; kthread_run_error: - mtip_hw_debugfs_exit(dd); + bdput(dd->bdev); + dd->bdev = NULL; /* Delete our gendisk. This also removes the device from /dev */ del_gendisk(dd->disk); @@ -4112,6 +4218,7 @@ read_capacity_error: blk_cleanup_queue(dd->queue); block_queue_alloc_init_error: + mtip_hw_debugfs_exit(dd); disk_index_error: spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, index); @@ -4141,40 +4248,48 @@ static int mtip_block_remove(struct driver_data *dd) { struct kobject *kobj; - if (dd->mtip_svc_handler) { - set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - kthread_stop(dd->mtip_svc_handler); - } + if (!dd->sr) { + mtip_hw_debugfs_exit(dd); - /* Clean up the sysfs attributes, if created */ - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { - kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); } - } - mtip_hw_debugfs_exit(dd); - /* - * Delete our gendisk structure. This also removes the device - * from /dev - */ - if (dd->disk) { - if (dd->disk->queue) - del_gendisk(dd->disk); - else - put_disk(dd->disk); - } - - spin_lock(&rssd_index_lock); - ida_remove(&rssd_index_ida, dd->index); - spin_unlock(&rssd_index_lock); + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + } + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + if (dd->bdev) { + bdput(dd->bdev); + dd->bdev = NULL; + } + if (dd->disk) { + if (dd->disk->queue) { + del_gendisk(dd->disk); + blk_cleanup_queue(dd->queue); + dd->queue = NULL; + } else + put_disk(dd->disk); + } + dd->disk = NULL; - blk_cleanup_queue(dd->queue); - dd->disk = NULL; - dd->queue = NULL; + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + } else { + dev_info(&dd->pdev->dev, "device %s surprise removal\n", + dd->disk->disk_name); + } /* De-initialize the protocol layer. */ mtip_hw_exit(dd); @@ -4490,8 +4605,7 @@ done: static void mtip_pci_remove(struct pci_dev *pdev) { struct driver_data *dd = pci_get_drvdata(pdev); - int counter = 0; - unsigned long flags; + unsigned long flags, to; set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); @@ -4500,17 +4614,22 @@ static void mtip_pci_remove(struct pci_dev *pdev) list_add(&dd->remove_list, &removing_list); spin_unlock_irqrestore(&dev_lock, flags); - if (mtip_check_surprise_removal(pdev)) { - while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { - counter++; - msleep(20); - if (counter == 10) { - /* Cleanup the outstanding commands */ - mtip_command_cleanup(dd); - break; - } - } + mtip_check_surprise_removal(pdev); + synchronize_irq(dd->pdev->irq); + + /* Spin until workers are done */ + to = jiffies + msecs_to_jiffies(4000); + do { + msleep(20); + } while (atomic_read(&dd->irq_workers_active) != 0 && + time_before(jiffies, to)); + + if (atomic_read(&dd->irq_workers_active) != 0) { + dev_warn(&dd->pdev->dev, + "Completion workers still active!\n"); } + /* Cleanup the outstanding commands */ + mtip_command_cleanup(dd); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -4529,8 +4648,15 @@ static void mtip_pci_remove(struct pci_dev *pdev) list_del_init(&dd->remove_list); spin_unlock_irqrestore(&dev_lock, flags); - kfree(dd); + if (!dd->sr) + kfree(dd); + else + set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag); + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); + pci_set_drvdata(pdev, NULL); + pci_dev_put(pdev); + } /* diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3bb8a295fbe4..9be7a1582ad3 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -140,6 +140,7 @@ enum { MTIP_PF_SVC_THD_ACTIVE_BIT = 4, MTIP_PF_ISSUE_CMDS_BIT = 5, MTIP_PF_REBUILD_BIT = 6, + MTIP_PF_SR_CLEANUP_BIT = 7, MTIP_PF_SVC_THD_STOP_BIT = 8, /* below are bit numbers in 'dd_flag' defined in driver_data */ @@ -147,15 +148,18 @@ enum { MTIP_DDF_REMOVE_PENDING_BIT = 1, MTIP_DDF_OVER_TEMP_BIT = 2, MTIP_DDF_WRITE_PROTECT_BIT = 3, - MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | - (1 << MTIP_DDF_SEC_LOCK_BIT) | - (1 << MTIP_DDF_OVER_TEMP_BIT) | - (1 << MTIP_DDF_WRITE_PROTECT_BIT)), - + MTIP_DDF_REMOVE_DONE_BIT = 4, MTIP_DDF_CLEANUP_BIT = 5, MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, MTIP_DDF_REBUILD_FAILED_BIT = 8, + + MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | + (1 << MTIP_DDF_SEC_LOCK_BIT) | + (1 << MTIP_DDF_OVER_TEMP_BIT) | + (1 << MTIP_DDF_WRITE_PROTECT_BIT) | + (1 << MTIP_DDF_REBUILD_FAILED_BIT)), + }; struct smart_attr { @@ -499,6 +503,8 @@ struct driver_data { bool trim_supp; /* flag indicating trim support */ + bool sr; + int numa_node; /* NUMA support */ char workq_name[32]; @@ -511,6 +517,8 @@ struct driver_data { int isr_binding; + struct block_device *bdev; + int unal_qdepth; /* qdepth of unaligned IO queue */ struct list_head online_list; /* linkage for online list */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 56188475cfd3..ff8668c5efb1 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -473,45 +473,31 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) { if (!pkt_debugfs_root) return; - pd->dfs_f_info = NULL; pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); - if (IS_ERR(pd->dfs_d_root)) { - pd->dfs_d_root = NULL; + if (!pd->dfs_d_root) return; - } + pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, pd->dfs_d_root, pd, &debug_fops); - if (IS_ERR(pd->dfs_f_info)) { - pd->dfs_f_info = NULL; - return; - } } static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) { if (!pkt_debugfs_root) return; - if (pd->dfs_f_info) - debugfs_remove(pd->dfs_f_info); + debugfs_remove(pd->dfs_f_info); + debugfs_remove(pd->dfs_d_root); pd->dfs_f_info = NULL; - if (pd->dfs_d_root) - debugfs_remove(pd->dfs_d_root); pd->dfs_d_root = NULL; } static void pkt_debugfs_init(void) { pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); - if (IS_ERR(pkt_debugfs_root)) { - pkt_debugfs_root = NULL; - return; - } } static void pkt_debugfs_cleanup(void) { - if (!pkt_debugfs_root) - return; debugfs_remove(pkt_debugfs_root); pkt_debugfs_root = NULL; } diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 6e85e21445eb..a8de2eec6ff3 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -654,7 +654,8 @@ static void rsxx_eeh_failure(struct pci_dev *dev) for (i = 0; i < card->n_targets; i++) { spin_lock_bh(&card->ctrl[i].queue_lock); cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], - &card->ctrl[i].queue); + &card->ctrl[i].queue, + COMPLETE_DMA); spin_unlock_bh(&card->ctrl[i].queue_lock); cnt += rsxx_dma_cancel(&card->ctrl[i]); @@ -748,10 +749,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) card->eeh_state = 0; - st = rsxx_eeh_remap_dmas(card); - if (st) - goto failed_remap_dmas; - spin_lock_irqsave(&card->irq_lock, flags); if (card->n_targets & RSXX_MAX_TARGETS) rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); @@ -778,7 +775,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) return PCI_ERS_RESULT_RECOVERED; failed_hw_buffers_init: -failed_remap_dmas: for (i = 0; i < card->n_targets; i++) { if (card->ctrl[i].status.buf) pci_free_consistent(card->dev, diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index d7af441880be..2284f5d3a54a 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -295,13 +295,15 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - blk_size = card->config.data.block_size; + if (card->config_valid) { + blk_size = card->config.data.block_size; + blk_queue_dma_alignment(card->queue, blk_size - 1); + blk_queue_logical_block_size(card->queue, blk_size); + } blk_queue_make_request(card->queue, rsxx_make_request); blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); - blk_queue_dma_alignment(card->queue, blk_size - 1); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); - blk_queue_logical_block_size(card->queue, blk_size); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index bed32f16b084..fc88ba3e1bd2 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -221,6 +221,21 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) } /*----------------- RSXX DMA Handling -------------------*/ +static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) +{ + if (dma->cmd != HW_CMD_BLK_DISCARD) { + if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { + pci_unmap_page(ctrl->card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + } + + kmem_cache_free(rsxx_dma_pool, dma); +} + static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma, unsigned int status) @@ -232,21 +247,14 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, if (status & DMA_CANCELLED) ctrl->stats.dma_cancelled++; - if (dma->dma_addr) - pci_unmap_page(ctrl->card->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - if (dma->cb) dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); - kmem_cache_free(rsxx_dma_pool, dma); + rsxx_free_dma(ctrl, dma); } int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, - struct list_head *q) + struct list_head *q, unsigned int done) { struct rsxx_dma *dma; struct rsxx_dma *tmp; @@ -254,7 +262,10 @@ int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, list_for_each_entry_safe(dma, tmp, q, list) { list_del(&dma->list); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + if (done & COMPLETE_DMA) + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + else + rsxx_free_dma(ctrl, dma); cnt++; } @@ -370,7 +381,7 @@ static void dma_engine_stalled(unsigned long data) /* Clean up the DMA queue */ spin_lock(&ctrl->queue_lock); - cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); + cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); spin_unlock(&ctrl->queue_lock); cnt += rsxx_dma_cancel(ctrl); @@ -388,6 +399,7 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) int tag; int cmds_pending = 0; struct hw_cmd *hw_cmd_buf; + int dir; hw_cmd_buf = ctrl->cmd.buf; @@ -424,6 +436,31 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) continue; } + if (dma->cmd != HW_CMD_BLK_DISCARD) { + if (dma->cmd == HW_CMD_BLK_WRITE) + dir = PCI_DMA_TODEVICE; + else + dir = PCI_DMA_FROMDEVICE; + + /* + * The function pci_map_page is placed here because we + * can only, by design, issue up to 255 commands to the + * hardware at one time per DMA channel. So the maximum + * amount of mapped memory would be 255 * 4 channels * + * 4096 Bytes which is less than 2GB, the limit of a x8 + * Non-HWWD PCIe slot. This way the pci_map_page + * function should never fail because of a lack of + * mappable memory. + */ + dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page, + dma->pg_off, dma->sub_page.cnt << 9, dir); + if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { + push_tracker(ctrl->trackers, tag); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + continue; + } + } + set_tracker_dma(ctrl->trackers, tag, dma); hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; hw_cmd_buf[ctrl->cmd.idx].tag = tag; @@ -620,14 +657,6 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, if (!dma) return -ENOMEM; - dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len, - dir ? PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - if (!dma->dma_addr) { - kmem_cache_free(rsxx_dma_pool, dma); - return -ENOMEM; - } - dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; dma->laddr = laddr; dma->sub_page.off = (dma_off >> 9); @@ -736,11 +765,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, return 0; bvec_err: - for (i = 0; i < card->n_targets; i++) { - spin_lock_bh(&card->ctrl[i].queue_lock); - rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); - spin_unlock_bh(&card->ctrl[i].queue_lock); - } + for (i = 0; i < card->n_targets; i++) + rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], + FREE_DMA); return st; } @@ -990,7 +1017,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) /* Clean up the DMA queue */ spin_lock_bh(&ctrl->queue_lock); - rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); + rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); spin_unlock_bh(&ctrl->queue_lock); rsxx_dma_cancel(ctrl); @@ -1032,6 +1059,14 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) else card->ctrl[i].stats.reads_issued--; + if (dma->cmd != HW_CMD_BLK_DISCARD) { + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + list_add_tail(&dma->list, &issued_dmas[i]); push_tracker(card->ctrl[i].trackers, j); cnt++; @@ -1043,15 +1078,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); card->ctrl[i].stats.sw_q_depth += cnt; card->ctrl[i].e_cnt = 0; - - list_for_each_entry(dma, &card->ctrl[i].queue, list) { - if (dma->dma_addr) - pci_unmap_page(card->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - } spin_unlock_bh(&card->ctrl[i].queue_lock); } @@ -1060,31 +1086,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) return 0; } -int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) -{ - struct rsxx_dma *dma; - int i; - - for (i = 0; i < card->n_targets; i++) { - spin_lock_bh(&card->ctrl[i].queue_lock); - list_for_each_entry(dma, &card->ctrl[i].queue, list) { - dma->dma_addr = pci_map_page(card->dev, dma->page, - dma->pg_off, get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - if (!dma->dma_addr) { - spin_unlock_bh(&card->ctrl[i].queue_lock); - kmem_cache_free(rsxx_dma_pool, dma); - return -ENOMEM; - } - } - spin_unlock_bh(&card->ctrl[i].queue_lock); - } - - return 0; -} - int rsxx_dma_init(void) { rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 5ad5055a4104..6bbc64d0f690 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -52,7 +52,7 @@ struct proc_cmd; #define RS70_PCI_REV_SUPPORTED 4 #define DRIVER_NAME "rsxx" -#define DRIVER_VERSION "4.0" +#define DRIVER_VERSION "4.0.3.2516" /* Block size is 4096 */ #define RSXX_HW_BLK_SHIFT 12 @@ -345,6 +345,11 @@ enum rsxx_creg_stat { CREG_STAT_TAG_MASK = 0x0000ff00, }; +enum rsxx_dma_finish { + FREE_DMA = 0x0, + COMPLETE_DMA = 0x1, +}; + static inline unsigned int CREG_DATA(int N) { return CREG_DATA0 + (N << 2); @@ -379,7 +384,9 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, int rsxx_dma_setup(struct rsxx_cardinfo *card); void rsxx_dma_destroy(struct rsxx_cardinfo *card); int rsxx_dma_init(void); -int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); +int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, + struct list_head *q, + unsigned int done); int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); void rsxx_dma_cleanup(void); void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c new file mode 100644 index 000000000000..9199c93be926 --- /dev/null +++ b/drivers/block/skd_main.c @@ -0,0 +1,5432 @@ +/* Copyright 2012 STEC, Inc. + * + * This file is licensed under the terms of the 3-clause + * BSD License (http://opensource.org/licenses/BSD-3-Clause) + * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), + * at your option. Both licenses are also available in the LICENSE file + * distributed with this project. This file may not be copied, modified, + * or distributed except in accordance with those terms. + * Gordoni Waidhofer <gwaidhofer@stec-inc.com> + * Initial Driver Design! + * Thomas Swann <tswann@stec-inc.com> + * Interrupt handling. + * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com> + * biomode implementation. + * Akhil Bhansali <abhansali@stec-inc.com> + * Added support for DISCARD / FLUSH and FUA. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/compiler.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <linux/delay.h> +#include <linux/time.h> +#include <linux/hdreg.h> +#include <linux/dma-mapping.h> +#include <linux/completion.h> +#include <linux/scatterlist.h> +#include <linux/version.h> +#include <linux/err.h> +#include <linux/scatterlist.h> +#include <linux/aer.h> +#include <linux/ctype.h> +#include <linux/wait.h> +#include <linux/uio.h> +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <linux/io.h> +#include <linux/uaccess.h> +#include <asm/unaligned.h> + +#include "skd_s1120.h" + +static int skd_dbg_level; +static int skd_isr_comp_limit = 4; + +enum { + STEC_LINK_2_5GTS = 0, + STEC_LINK_5GTS = 1, + STEC_LINK_8GTS = 2, + STEC_LINK_UNKNOWN = 0xFF +}; + +enum { + SKD_FLUSH_INITIALIZER, + SKD_FLUSH_ZERO_SIZE_FIRST, + SKD_FLUSH_DATA_SECOND, +}; + +#define SKD_ASSERT(expr) \ + do { \ + if (unlikely(!(expr))) { \ + pr_err("Assertion failed! %s,%s,%s,line=%d\n", \ + # expr, __FILE__, __func__, __LINE__); \ + } \ + } while (0) + +#define DRV_NAME "skd" +#define DRV_VERSION "2.2.1" +#define DRV_BUILD_ID "0260" +#define PFX DRV_NAME ": " +#define DRV_BIN_VERSION 0x100 +#define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID + +MODULE_AUTHOR("bug-reports: support@stec-inc.com"); +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); +MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); + +#define PCI_VENDOR_ID_STEC 0x1B39 +#define PCI_DEVICE_ID_S1120 0x0001 + +#define SKD_FUA_NV (1 << 1) +#define SKD_MINORS_PER_DEVICE 16 + +#define SKD_MAX_QUEUE_DEPTH 200u + +#define SKD_PAUSE_TIMEOUT (5 * 1000) + +#define SKD_N_FITMSG_BYTES (512u) + +#define SKD_N_SPECIAL_CONTEXT 32u +#define SKD_N_SPECIAL_FITMSG_BYTES (128u) + +/* SG elements are 32 bytes, so we can make this 4096 and still be under the + * 128KB limit. That allows 4096*4K = 16M xfer size + */ +#define SKD_N_SG_PER_REQ_DEFAULT 256u +#define SKD_N_SG_PER_SPECIAL 256u + +#define SKD_N_COMPLETION_ENTRY 256u +#define SKD_N_READ_CAP_BYTES (8u) + +#define SKD_N_INTERNAL_BYTES (512u) + +/* 5 bits of uniqifier, 0xF800 */ +#define SKD_ID_INCR (0x400) +#define SKD_ID_TABLE_MASK (3u << 8u) +#define SKD_ID_RW_REQUEST (0u << 8u) +#define SKD_ID_INTERNAL (1u << 8u) +#define SKD_ID_SPECIAL_REQUEST (2u << 8u) +#define SKD_ID_FIT_MSG (3u << 8u) +#define SKD_ID_SLOT_MASK 0x00FFu +#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu + +#define SKD_N_TIMEOUT_SLOT 4u +#define SKD_TIMEOUT_SLOT_MASK 3u + +#define SKD_N_MAX_SECTORS 2048u + +#define SKD_MAX_RETRIES 2u + +#define SKD_TIMER_SECONDS(seconds) (seconds) +#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) + +#define INQ_STD_NBYTES 36 +#define SKD_DISCARD_CDB_LENGTH 24 + +enum skd_drvr_state { + SKD_DRVR_STATE_LOAD, + SKD_DRVR_STATE_IDLE, + SKD_DRVR_STATE_BUSY, + SKD_DRVR_STATE_STARTING, + SKD_DRVR_STATE_ONLINE, + SKD_DRVR_STATE_PAUSING, + SKD_DRVR_STATE_PAUSED, + SKD_DRVR_STATE_DRAINING_TIMEOUT, + SKD_DRVR_STATE_RESTARTING, + SKD_DRVR_STATE_RESUMING, + SKD_DRVR_STATE_STOPPING, + SKD_DRVR_STATE_FAULT, + SKD_DRVR_STATE_DISAPPEARED, + SKD_DRVR_STATE_PROTOCOL_MISMATCH, + SKD_DRVR_STATE_BUSY_ERASE, + SKD_DRVR_STATE_BUSY_SANITIZE, + SKD_DRVR_STATE_BUSY_IMMINENT, + SKD_DRVR_STATE_WAIT_BOOT, + SKD_DRVR_STATE_SYNCING, +}; + +#define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u) +#define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u) +#define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u) +#define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u) +#define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u) +#define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u) +#define SKD_START_WAIT_SECONDS 90u + +enum skd_req_state { + SKD_REQ_STATE_IDLE, + SKD_REQ_STATE_SETUP, + SKD_REQ_STATE_BUSY, + SKD_REQ_STATE_COMPLETED, + SKD_REQ_STATE_TIMEOUT, + SKD_REQ_STATE_ABORTED, +}; + +enum skd_fit_msg_state { + SKD_MSG_STATE_IDLE, + SKD_MSG_STATE_BUSY, +}; + +enum skd_check_status_action { + SKD_CHECK_STATUS_REPORT_GOOD, + SKD_CHECK_STATUS_REPORT_SMART_ALERT, + SKD_CHECK_STATUS_REQUEUE_REQUEST, + SKD_CHECK_STATUS_REPORT_ERROR, + SKD_CHECK_STATUS_BUSY_IMMINENT, +}; + +struct skd_fitmsg_context { + enum skd_fit_msg_state state; + + struct skd_fitmsg_context *next; + + u32 id; + u16 outstanding; + + u32 length; + u32 offset; + + u8 *msg_buf; + dma_addr_t mb_dma_address; +}; + +struct skd_request_context { + enum skd_req_state state; + + struct skd_request_context *next; + + u16 id; + u32 fitmsg_id; + + struct request *req; + u8 flush_cmd; + u8 discard_page; + + u32 timeout_stamp; + u8 sg_data_dir; + struct scatterlist *sg; + u32 n_sg; + u32 sg_byte_count; + + struct fit_sg_descriptor *sksg_list; + dma_addr_t sksg_dma_address; + + struct fit_completion_entry_v1 completion; + + struct fit_comp_error_info err_info; + +}; +#define SKD_DATA_DIR_HOST_TO_CARD 1 +#define SKD_DATA_DIR_CARD_TO_HOST 2 +#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */ + +struct skd_special_context { + struct skd_request_context req; + + u8 orphaned; + + void *data_buf; + dma_addr_t db_dma_address; + + u8 *msg_buf; + dma_addr_t mb_dma_address; +}; + +struct skd_sg_io { + fmode_t mode; + void __user *argp; + + struct sg_io_hdr sg; + + u8 cdb[16]; + + u32 dxfer_len; + u32 iovcnt; + struct sg_iovec *iov; + struct sg_iovec no_iov_iov; + + struct skd_special_context *skspcl; +}; + +typedef enum skd_irq_type { + SKD_IRQ_LEGACY, + SKD_IRQ_MSI, + SKD_IRQ_MSIX +} skd_irq_type_t; + +#define SKD_MAX_BARS 2 + +struct skd_device { + volatile void __iomem *mem_map[SKD_MAX_BARS]; + resource_size_t mem_phys[SKD_MAX_BARS]; + u32 mem_size[SKD_MAX_BARS]; + + skd_irq_type_t irq_type; + u32 msix_count; + struct skd_msix_entry *msix_entries; + + struct pci_dev *pdev; + int pcie_error_reporting_is_enabled; + + spinlock_t lock; + struct gendisk *disk; + struct request_queue *queue; + struct device *class_dev; + int gendisk_on; + int sync_done; + + atomic_t device_count; + u32 devno; + u32 major; + char name[32]; + char isr_name[30]; + + enum skd_drvr_state state; + u32 drive_state; + + u32 in_flight; + u32 cur_max_queue_depth; + u32 queue_low_water_mark; + u32 dev_max_queue_depth; + + u32 num_fitmsg_context; + u32 num_req_context; + + u32 timeout_slot[SKD_N_TIMEOUT_SLOT]; + u32 timeout_stamp; + struct skd_fitmsg_context *skmsg_free_list; + struct skd_fitmsg_context *skmsg_table; + + struct skd_request_context *skreq_free_list; + struct skd_request_context *skreq_table; + + struct skd_special_context *skspcl_free_list; + struct skd_special_context *skspcl_table; + + struct skd_special_context internal_skspcl; + u32 read_cap_blocksize; + u32 read_cap_last_lba; + int read_cap_is_valid; + int inquiry_is_valid; + u8 inq_serial_num[13]; /*12 chars plus null term */ + u8 id_str[80]; /* holds a composite name (pci + sernum) */ + + u8 skcomp_cycle; + u32 skcomp_ix; + struct fit_completion_entry_v1 *skcomp_table; + struct fit_comp_error_info *skerr_table; + dma_addr_t cq_dma_address; + + wait_queue_head_t waitq; + + struct timer_list timer; + u32 timer_countdown; + u32 timer_substate; + + int n_special; + int sgs_per_request; + u32 last_mtd; + + u32 proto_ver; + + int dbg_level; + u32 connect_time_stamp; + int connect_retries; +#define SKD_MAX_CONNECT_RETRIES 16 + u32 drive_jiffies; + + u32 timo_slot; + + + struct work_struct completion_worker; +}; + +#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF) +#define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF) +#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF) + +static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) +{ + u32 val; + + if (likely(skdev->dbg_level < 2)) + return readl(skdev->mem_map[1] + offset); + else { + barrier(); + val = readl(skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %x\n", + skdev->name, __func__, __LINE__, offset, val); + return val; + } + +} + +static inline void skd_reg_write32(struct skd_device *skdev, u32 val, + u32 offset) +{ + if (likely(skdev->dbg_level < 2)) { + writel(val, skdev->mem_map[1] + offset); + barrier(); + } else { + barrier(); + writel(val, skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %x\n", + skdev->name, __func__, __LINE__, offset, val); + } +} + +static inline void skd_reg_write64(struct skd_device *skdev, u64 val, + u32 offset) +{ + if (likely(skdev->dbg_level < 2)) { + writeq(val, skdev->mem_map[1] + offset); + barrier(); + } else { + barrier(); + writeq(val, skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %016llx\n", + skdev->name, __func__, __LINE__, offset, val); + } +} + + +#define SKD_IRQ_DEFAULT SKD_IRQ_MSI +static int skd_isr_type = SKD_IRQ_DEFAULT; + +module_param(skd_isr_type, int, 0444); +MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability." + " (0==legacy, 1==MSI, 2==MSI-X, default==1)"); + +#define SKD_MAX_REQ_PER_MSG_DEFAULT 1 +static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; + +module_param(skd_max_req_per_msg, int, 0444); +MODULE_PARM_DESC(skd_max_req_per_msg, + "Maximum SCSI requests packed in a single message." + " (1-14, default==1)"); + +#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64 +#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64" +static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; + +module_param(skd_max_queue_depth, int, 0444); +MODULE_PARM_DESC(skd_max_queue_depth, + "Maximum SCSI requests issued to s1120." + " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")"); + +static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; +module_param(skd_sgs_per_request, int, 0444); +MODULE_PARM_DESC(skd_sgs_per_request, + "Maximum SG elements per block request." + " (1-4096, default==256)"); + +static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; +module_param(skd_max_pass_thru, int, 0444); +MODULE_PARM_DESC(skd_max_pass_thru, + "Maximum SCSI pass-thru at a time." " (1-50, default==32)"); + +module_param(skd_dbg_level, int, 0444); +MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)"); + +module_param(skd_isr_comp_limit, int, 0444); +MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4"); + +/* Major device number dynamically assigned. */ +static u32 skd_major; + +static void skd_destruct(struct skd_device *skdev); +static const struct block_device_operations skd_blockdev_ops; +static void skd_send_fitmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg); +static void skd_send_special_fitmsg(struct skd_device *skdev, + struct skd_special_context *skspcl); +static void skd_request_fn(struct request_queue *rq); +static void skd_end_request(struct skd_device *skdev, + struct skd_request_context *skreq, int error); +static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq); +static void skd_postop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq); + +static void skd_restart_device(struct skd_device *skdev); +static int skd_quiesce_dev(struct skd_device *skdev); +static int skd_unquiesce_dev(struct skd_device *skdev); +static void skd_release_special(struct skd_device *skdev, + struct skd_special_context *skspcl); +static void skd_disable_interrupts(struct skd_device *skdev); +static void skd_isr_fwstate(struct skd_device *skdev); +static void skd_recover_requests(struct skd_device *skdev, int requeue); +static void skd_soft_reset(struct skd_device *skdev); + +static const char *skd_name(struct skd_device *skdev); +const char *skd_drive_state_to_str(int state); +const char *skd_skdev_state_to_str(enum skd_drvr_state state); +static void skd_log_skdev(struct skd_device *skdev, const char *event); +static void skd_log_skmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg, const char *event); +static void skd_log_skreq(struct skd_device *skdev, + struct skd_request_context *skreq, const char *event); + +/* + ***************************************************************************** + * READ/WRITE REQUESTS + ***************************************************************************** + */ +static void skd_fail_all_pending(struct skd_device *skdev) +{ + struct request_queue *q = skdev->queue; + struct request *req; + + for (;; ) { + req = blk_peek_request(q); + if (req == NULL) + break; + blk_start_request(req); + __blk_end_request_all(req, -EIO); + } +} + +static void +skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, + int data_dir, unsigned lba, + unsigned count) +{ + if (data_dir == READ) + scsi_req->cdb[0] = 0x28; + else + scsi_req->cdb[0] = 0x2a; + + scsi_req->cdb[1] = 0; + scsi_req->cdb[2] = (lba & 0xff000000) >> 24; + scsi_req->cdb[3] = (lba & 0xff0000) >> 16; + scsi_req->cdb[4] = (lba & 0xff00) >> 8; + scsi_req->cdb[5] = (lba & 0xff); + scsi_req->cdb[6] = 0; + scsi_req->cdb[7] = (count & 0xff00) >> 8; + scsi_req->cdb[8] = count & 0xff; + scsi_req->cdb[9] = 0; +} + +static void +skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, + struct skd_request_context *skreq) +{ + skreq->flush_cmd = 1; + + scsi_req->cdb[0] = 0x35; + scsi_req->cdb[1] = 0; + scsi_req->cdb[2] = 0; + scsi_req->cdb[3] = 0; + scsi_req->cdb[4] = 0; + scsi_req->cdb[5] = 0; + scsi_req->cdb[6] = 0; + scsi_req->cdb[7] = 0; + scsi_req->cdb[8] = 0; + scsi_req->cdb[9] = 0; +} + +static void +skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, + struct skd_request_context *skreq, + struct page *page, + u32 lba, u32 count) +{ + char *buf; + unsigned long len; + struct request *req; + + buf = page_address(page); + len = SKD_DISCARD_CDB_LENGTH; + + scsi_req->cdb[0] = UNMAP; + scsi_req->cdb[8] = len; + + put_unaligned_be16(6 + 16, &buf[0]); + put_unaligned_be16(16, &buf[2]); + put_unaligned_be64(lba, &buf[8]); + put_unaligned_be32(count, &buf[16]); + + req = skreq->req; + blk_add_request_payload(req, page, len); + req->buffer = buf; +} + +static void skd_request_fn_not_online(struct request_queue *q); + +static void skd_request_fn(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + struct skd_fitmsg_context *skmsg = NULL; + struct fit_msg_hdr *fmh = NULL; + struct skd_request_context *skreq; + struct request *req = NULL; + struct skd_scsi_request *scsi_req; + struct page *page; + unsigned long io_flags; + int error; + u32 lba; + u32 count; + int data_dir; + u32 be_lba; + u32 be_count; + u64 be_dmaa; + u64 cmdctxt; + u32 timo_slot; + void *cmd_ptr; + int flush, fua; + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + skd_request_fn_not_online(q); + return; + } + + if (blk_queue_stopped(skdev->queue)) { + if (skdev->skmsg_free_list == NULL || + skdev->skreq_free_list == NULL || + skdev->in_flight >= skdev->queue_low_water_mark) + /* There is still some kind of shortage */ + return; + + queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue); + } + + /* + * Stop conditions: + * - There are no more native requests + * - There are already the maximum number of requests in progress + * - There are no more skd_request_context entries + * - There are no more FIT msg buffers + */ + for (;; ) { + + flush = fua = 0; + + req = blk_peek_request(q); + + /* Are there any native requests to start? */ + if (req == NULL) + break; + + lba = (u32)blk_rq_pos(req); + count = blk_rq_sectors(req); + data_dir = rq_data_dir(req); + io_flags = req->cmd_flags; + + if (io_flags & REQ_FLUSH) + flush++; + + if (io_flags & REQ_FUA) + fua++; + + pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) " + "count=%u(0x%x) dir=%d\n", + skdev->name, __func__, __LINE__, + req, lba, lba, count, count, data_dir); + + /* At this point we know there is a request */ + + /* Are too many requets already in progress? */ + if (skdev->in_flight >= skdev->cur_max_queue_depth) { + pr_debug("%s:%s:%d qdepth %d, limit %d\n", + skdev->name, __func__, __LINE__, + skdev->in_flight, skdev->cur_max_queue_depth); + break; + } + + /* Is a skd_request_context available? */ + skreq = skdev->skreq_free_list; + if (skreq == NULL) { + pr_debug("%s:%s:%d Out of req=%p\n", + skdev->name, __func__, __LINE__, q); + break; + } + SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); + SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); + + /* Now we check to see if we can get a fit msg */ + if (skmsg == NULL) { + if (skdev->skmsg_free_list == NULL) { + pr_debug("%s:%s:%d Out of msg\n", + skdev->name, __func__, __LINE__); + break; + } + } + + skreq->flush_cmd = 0; + skreq->n_sg = 0; + skreq->sg_byte_count = 0; + skreq->discard_page = 0; + + /* + * OK to now dequeue request from q. + * + * At this point we are comitted to either start or reject + * the native request. Note that skd_request_context is + * available but is still at the head of the free list. + */ + blk_start_request(req); + skreq->req = req; + skreq->fitmsg_id = 0; + + /* Either a FIT msg is in progress or we have to start one. */ + if (skmsg == NULL) { + /* Are there any FIT msg buffers available? */ + skmsg = skdev->skmsg_free_list; + if (skmsg == NULL) { + pr_debug("%s:%s:%d Out of msg skdev=%p\n", + skdev->name, __func__, __LINE__, + skdev); + break; + } + SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE); + SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0); + + skdev->skmsg_free_list = skmsg->next; + + skmsg->state = SKD_MSG_STATE_BUSY; + skmsg->id += SKD_ID_INCR; + + /* Initialize the FIT msg header */ + fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + memset(fmh, 0, sizeof(*fmh)); + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + skmsg->length = sizeof(*fmh); + } + + skreq->fitmsg_id = skmsg->id; + + /* + * Note that a FIT msg may have just been started + * but contains no SoFIT requests yet. + */ + + /* + * Transcode the request, checking as we go. The outcome of + * the transcoding is represented by the error variable. + */ + cmd_ptr = &skmsg->msg_buf[skmsg->length]; + memset(cmd_ptr, 0, 32); + + be_lba = cpu_to_be32(lba); + be_count = cpu_to_be32(count); + be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address); + cmdctxt = skreq->id + SKD_ID_INCR; + + scsi_req = cmd_ptr; + scsi_req->hdr.tag = cmdctxt; + scsi_req->hdr.sg_list_dma_address = be_dmaa; + + if (data_dir == READ) + skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST; + else + skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; + + if (io_flags & REQ_DISCARD) { + page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!page) { + pr_err("request_fn:Page allocation failed.\n"); + skd_end_request(skdev, skreq, -ENOMEM); + break; + } + skreq->discard_page = 1; + skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); + + } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + skd_prep_zerosize_flush_cdb(scsi_req, skreq); + SKD_ASSERT(skreq->flush_cmd == 1); + + } else { + skd_prep_rw_cdb(scsi_req, data_dir, lba, count); + } + + if (fua) + scsi_req->cdb[1] |= SKD_FUA_NV; + + if (!req->bio) + goto skip_sg; + + error = skd_preop_sg_list(skdev, skreq); + + if (error != 0) { + /* + * Complete the native request with error. + * Note that the request context is still at the + * head of the free list, and that the SoFIT request + * was encoded into the FIT msg buffer but the FIT + * msg length has not been updated. In short, the + * only resource that has been allocated but might + * not be used is that the FIT msg could be empty. + */ + pr_debug("%s:%s:%d error Out\n", + skdev->name, __func__, __LINE__); + skd_end_request(skdev, skreq, error); + continue; + } + +skip_sg: + scsi_req->hdr.sg_list_len_bytes = + cpu_to_be32(skreq->sg_byte_count); + + /* Complete resource allocations. */ + skdev->skreq_free_list = skreq->next; + skreq->state = SKD_REQ_STATE_BUSY; + skreq->id += SKD_ID_INCR; + + skmsg->length += sizeof(struct skd_scsi_request); + fmh->num_protocol_cmds_coalesced++; + + /* + * Update the active request counts. + * Capture the timeout timestamp. + */ + skreq->timeout_stamp = skdev->timeout_stamp; + timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + skdev->timeout_slot[timo_slot]++; + skdev->in_flight++; + pr_debug("%s:%s:%d req=0x%x busy=%d\n", + skdev->name, __func__, __LINE__, + skreq->id, skdev->in_flight); + + /* + * If the FIT msg buffer is full send it. + */ + if (skmsg->length >= SKD_N_FITMSG_BYTES || + fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + skd_send_fitmsg(skdev, skmsg); + skmsg = NULL; + fmh = NULL; + } + } + + /* + * Is a FIT msg in progress? If it is empty put the buffer back + * on the free list. If it is non-empty send what we got. + * This minimizes latency when there are fewer requests than + * what fits in a FIT msg. + */ + if (skmsg != NULL) { + /* Bigger than just a FIT msg header? */ + if (skmsg->length > sizeof(struct fit_msg_hdr)) { + pr_debug("%s:%s:%d sending msg=%p, len %d\n", + skdev->name, __func__, __LINE__, + skmsg, skmsg->length); + skd_send_fitmsg(skdev, skmsg); + } else { + /* + * The FIT msg is empty. It means we got started + * on the msg, but the requests were rejected. + */ + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + skmsg->next = skdev->skmsg_free_list; + skdev->skmsg_free_list = skmsg; + } + skmsg = NULL; + fmh = NULL; + } + + /* + * If req is non-NULL it means there is something to do but + * we are out of a resource. + */ + if (req) + blk_stop_queue(skdev->queue); +} + +static void skd_end_request(struct skd_device *skdev, + struct skd_request_context *skreq, int error) +{ + struct request *req = skreq->req; + unsigned int io_flags = req->cmd_flags; + + if ((io_flags & REQ_DISCARD) && + (skreq->discard_page == 1)) { + pr_debug("%s:%s:%d, free the page!", + skdev->name, __func__, __LINE__); + free_page((unsigned long)req->buffer); + req->buffer = NULL; + } + + if (unlikely(error)) { + struct request *req = skreq->req; + char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; + u32 lba = (u32)blk_rq_pos(req); + u32 count = blk_rq_sectors(req); + + pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n", + skd_name(skdev), cmd, lba, count, skreq->id); + } else + pr_debug("%s:%s:%d id=0x%x error=%d\n", + skdev->name, __func__, __LINE__, skreq->id, error); + + __blk_end_request_all(skreq->req, error); +} + +static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + struct request *req = skreq->req; + int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; + int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; + struct scatterlist *sg = &skreq->sg[0]; + int n_sg; + int i; + + skreq->sg_byte_count = 0; + + /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD || + skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */ + + n_sg = blk_rq_map_sg(skdev->queue, req, sg); + if (n_sg <= 0) + return -EINVAL; + + /* + * Map scatterlist to PCI bus addresses. + * Note PCI might change the number of entries. + */ + n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); + if (n_sg <= 0) + return -EINVAL; + + SKD_ASSERT(n_sg <= skdev->sgs_per_request); + + skreq->n_sg = n_sg; + + for (i = 0; i < n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + u32 cnt = sg_dma_len(&sg[i]); + uint64_t dma_addr = sg_dma_address(&sg[i]); + + sgd->control = FIT_SGD_CONTROL_NOT_LAST; + sgd->byte_count = cnt; + skreq->sg_byte_count += cnt; + sgd->host_side_addr = dma_addr; + sgd->dev_side_addr = 0; + } + + skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL; + skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST; + + if (unlikely(skdev->dbg_level > 1)) { + pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + for (i = 0; i < n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + return 0; +} + +static void skd_postop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; + int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; + + /* + * restore the next ptr for next IO request so we + * don't have to set it every time. + */ + skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr = + skreq->sksg_dma_address + + ((skreq->n_sg) * sizeof(struct fit_sg_descriptor)); + pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir); +} + +static void skd_request_fn_not_online(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + int error; + + SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); + + skd_log_skdev(skdev, "req_not_online"); + switch (skdev->state) { + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_WAIT_BOOT: + /* In case of starting, we haven't started the queue, + * so we can't get here... but requests are + * possibly hanging out waiting for us because we + * reported the dev/skd0 already. They'll wait + * forever if connect doesn't complete. + * What to do??? delay dev/skd0 ?? + */ + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + return; + + case SKD_DRVR_STATE_BUSY_SANITIZE: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_DISAPPEARED: + default: + error = -EIO; + break; + } + + /* If we get here, terminate all pending block requeusts + * with EIO and any scsi pass thru with appropriate sense + */ + + skd_fail_all_pending(skdev); +} + +/* + ***************************************************************************** + * TIMER + ***************************************************************************** + */ + +static void skd_timer_tick_not_online(struct skd_device *skdev); + +static void skd_timer_tick(ulong arg) +{ + struct skd_device *skdev = (struct skd_device *)arg; + + u32 timo_slot; + u32 overdue_timestamp; + unsigned long reqflags; + u32 state; + + if (skdev->state == SKD_DRVR_STATE_FAULT) + /* The driver has declared fault, and we want it to + * stay that way until driver is reloaded. + */ + return; + + spin_lock_irqsave(&skdev->lock, reqflags); + + state = SKD_READL(skdev, FIT_STATUS); + state &= FIT_SR_DRIVE_STATE_MASK; + if (state != skdev->drive_state) + skd_isr_fwstate(skdev); + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + skd_timer_tick_not_online(skdev); + goto timer_func_out; + } + skdev->timeout_stamp++; + timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + + /* + * All requests that happened during the previous use of + * this slot should be done by now. The previous use was + * over 7 seconds ago. + */ + if (skdev->timeout_slot[timo_slot] == 0) + goto timer_func_out; + + /* Something is overdue */ + overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT; + + pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n", + skdev->name, __func__, __LINE__, + skdev->timeout_slot[timo_slot], skdev->in_flight); + pr_err("(%s): Overdue IOs (%d), busy %d\n", + skd_name(skdev), skdev->timeout_slot[timo_slot], + skdev->in_flight); + + skdev->timer_countdown = SKD_DRAINING_TIMO; + skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; + skdev->timo_slot = timo_slot; + blk_stop_queue(skdev->queue); + +timer_func_out: + mod_timer(&skdev->timer, (jiffies + HZ)); + + spin_unlock_irqrestore(&skdev->lock, reqflags); +} + +static void skd_timer_tick_not_online(struct skd_device *skdev) +{ + switch (skdev->state) { + case SKD_DRVR_STATE_IDLE: + case SKD_DRVR_STATE_LOAD: + break; + case SKD_DRVR_STATE_BUSY_SANITIZE: + pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n", + skdev->name, __func__, __LINE__, + skdev->drive_state, skdev->state); + /* If we've been in sanitize for 3 seconds, we figure we're not + * going to get anymore completions, so recover requests now + */ + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + skd_recover_requests(skdev, 0); + break; + + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + pr_debug("%s:%s:%d busy[%x], countdown=%d\n", + skdev->name, __func__, __LINE__, + skdev->state, skdev->timer_countdown); + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.", + skdev->name, __func__, __LINE__, + skdev->state, skdev->timer_countdown); + skd_restart_device(skdev); + break; + + case SKD_DRVR_STATE_WAIT_BOOT: + case SKD_DRVR_STATE_STARTING: + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + /* For now, we fault the drive. Could attempt resets to + * revcover at some point. */ + skdev->state = SKD_DRVR_STATE_FAULT; + + pr_err("(%s): DriveFault Connect Timeout (%x)\n", + skd_name(skdev), skdev->drive_state); + + /*start the queue so we can respond with error to requests */ + /* wakeup anyone waiting for startup complete */ + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_ONLINE: + /* shouldn't get here. */ + break; + + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + break; + + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + pr_debug("%s:%s:%d " + "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", + skdev->name, __func__, __LINE__, + skdev->timo_slot, + skdev->timer_countdown, + skdev->in_flight, + skdev->timeout_slot[skdev->timo_slot]); + /* if the slot has cleared we can let the I/O continue */ + if (skdev->timeout_slot[skdev->timo_slot] == 0) { + pr_debug("%s:%s:%d Slot drained, starting queue.\n", + skdev->name, __func__, __LINE__); + skdev->state = SKD_DRVR_STATE_ONLINE; + blk_start_queue(skdev->queue); + return; + } + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + skd_restart_device(skdev); + break; + + case SKD_DRVR_STATE_RESTARTING: + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + /* For now, we fault the drive. Could attempt resets to + * revcover at some point. */ + skdev->state = SKD_DRVR_STATE_FAULT; + pr_err("(%s): DriveFault Reconnect Timeout (%x)\n", + skd_name(skdev), skdev->drive_state); + + /* + * Recovering does two things: + * 1. completes IO with error + * 2. reclaims dma resources + * When is it safe to recover requests? + * - if the drive state is faulted + * - if the state is still soft reset after out timeout + * - if the drive registers are dead (state = FF) + * If it is "unsafe", we still need to recover, so we will + * disable pci bus mastering and disable our interrupts. + */ + + if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) || + (skdev->drive_state == FIT_SR_DRIVE_FAULT) || + (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK)) + /* It never came out of soft reset. Try to + * recover the requests and then let them + * fail. This is to mitigate hung processes. */ + skd_recover_requests(skdev, 0); + else { + pr_err("(%s): Disable BusMaster (%x)\n", + skd_name(skdev), skdev->drive_state); + pci_disable_device(skdev->pdev); + skd_disable_interrupts(skdev); + skd_recover_requests(skdev, 0); + } + + /*start the queue so we can respond with error to requests */ + /* wakeup anyone waiting for startup complete */ + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_RESUMING: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_DISAPPEARED: + default: + break; + } +} + +static int skd_start_timer(struct skd_device *skdev) +{ + int rc; + + init_timer(&skdev->timer); + setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev); + + rc = mod_timer(&skdev->timer, (jiffies + HZ)); + if (rc) + pr_err("%s: failed to start timer %d\n", + __func__, rc); + return rc; +} + +static void skd_kill_timer(struct skd_device *skdev) +{ + del_timer_sync(&skdev->timer); +} + +/* + ***************************************************************************** + * IOCTL + ***************************************************************************** + */ +static int skd_ioctl_sg_io(struct skd_device *skdev, + fmode_t mode, void __user *argp); +static int skd_sg_io_get_and_check_args(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_prep_buffering(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_copy_buffer(struct skd_device *skdev, + struct skd_sg_io *sksgio, int dxfer_dir); +static int skd_sg_io_send_fitmsg(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio); +static int skd_sg_io_release_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_put_status(struct skd_device *skdev, + struct skd_sg_io *sksgio); + +static void skd_complete_special(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl); + +static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, + uint cmd_in, ulong arg) +{ + int rc = 0; + struct gendisk *disk = bdev->bd_disk; + struct skd_device *skdev = disk->private_data; + void __user *p = (void *)arg; + + pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", + skdev->name, __func__, __LINE__, + disk->disk_name, current->comm, mode, cmd_in, arg); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd_in) { + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_VERSION_NUM: + rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p); + break; + case SG_IO: + rc = skd_ioctl_sg_io(skdev, mode, p); + break; + + default: + rc = -ENOTTY; + break; + } + + pr_debug("%s:%s:%d %s: completion rc %d\n", + skdev->name, __func__, __LINE__, disk->disk_name, rc); + return rc; +} + +static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode, + void __user *argp) +{ + int rc; + struct skd_sg_io sksgio; + + memset(&sksgio, 0, sizeof(sksgio)); + sksgio.mode = mode; + sksgio.argp = argp; + sksgio.iov = &sksgio.no_iov_iov; + + switch (skdev->state) { + case SKD_DRVR_STATE_ONLINE: + case SKD_DRVR_STATE_BUSY_IMMINENT: + break; + + default: + pr_debug("%s:%s:%d drive not online\n", + skdev->name, __func__, __LINE__); + rc = -ENXIO; + goto out; + } + + rc = skd_sg_io_get_and_check_args(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_obtain_skspcl(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_prep_buffering(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV); + if (rc) + goto out; + + rc = skd_sg_io_send_fitmsg(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_await(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV); + if (rc) + goto out; + + rc = skd_sg_io_put_status(skdev, &sksgio); + if (rc) + goto out; + + rc = 0; + +out: + skd_sg_io_release_skspcl(skdev, &sksgio); + + if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov) + kfree(sksgio.iov); + return rc; +} + +static int skd_sg_io_get_and_check_args(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct sg_io_hdr *sgp = &sksgio->sg; + int i, acc; + + if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d access sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d copy_from_user sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + if (sgp->interface_id != SG_INTERFACE_ID_ORIG) { + pr_debug("%s:%s:%d interface_id invalid 0x%x\n", + skdev->name, __func__, __LINE__, sgp->interface_id); + return -EINVAL; + } + + if (sgp->cmd_len > sizeof(sksgio->cdb)) { + pr_debug("%s:%s:%d cmd_len invalid %d\n", + skdev->name, __func__, __LINE__, sgp->cmd_len); + return -EINVAL; + } + + if (sgp->iovec_count > 256) { + pr_debug("%s:%s:%d iovec_count invalid %d\n", + skdev->name, __func__, __LINE__, sgp->iovec_count); + return -EINVAL; + } + + if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) { + pr_debug("%s:%s:%d dxfer_len invalid %d\n", + skdev->name, __func__, __LINE__, sgp->dxfer_len); + return -EINVAL; + } + + switch (sgp->dxfer_direction) { + case SG_DXFER_NONE: + acc = -1; + break; + + case SG_DXFER_TO_DEV: + acc = VERIFY_READ; + break; + + case SG_DXFER_FROM_DEV: + case SG_DXFER_TO_FROM_DEV: + acc = VERIFY_WRITE; + break; + + default: + pr_debug("%s:%s:%d dxfer_dir invalid %d\n", + skdev->name, __func__, __LINE__, sgp->dxfer_direction); + return -EINVAL; + } + + if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) { + pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n", + skdev->name, __func__, __LINE__, sgp->cmdp); + return -EFAULT; + } + + if (sgp->mx_sb_len != 0) { + if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) { + pr_debug("%s:%s:%d access sbp failed %p\n", + skdev->name, __func__, __LINE__, sgp->sbp); + return -EFAULT; + } + } + + if (sgp->iovec_count == 0) { + sksgio->iov[0].iov_base = sgp->dxferp; + sksgio->iov[0].iov_len = sgp->dxfer_len; + sksgio->iovcnt = 1; + sksgio->dxfer_len = sgp->dxfer_len; + } else { + struct sg_iovec *iov; + uint nbytes = sizeof(*iov) * sgp->iovec_count; + size_t iov_data_len; + + iov = kmalloc(nbytes, GFP_KERNEL); + if (iov == NULL) { + pr_debug("%s:%s:%d alloc iovec failed %d\n", + skdev->name, __func__, __LINE__, + sgp->iovec_count); + return -ENOMEM; + } + sksgio->iov = iov; + sksgio->iovcnt = sgp->iovec_count; + + if (copy_from_user(iov, sgp->dxferp, nbytes)) { + pr_debug("%s:%s:%d copy_from_user iovec failed %p\n", + skdev->name, __func__, __LINE__, sgp->dxferp); + return -EFAULT; + } + + /* + * Sum up the vecs, making sure they don't overflow + */ + iov_data_len = 0; + for (i = 0; i < sgp->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) + return -EINVAL; + iov_data_len += iov[i].iov_len; + } + + /* SG_IO howto says that the shorter of the two wins */ + if (sgp->dxfer_len < iov_data_len) { + sksgio->iovcnt = iov_shorten((struct iovec *)iov, + sgp->iovec_count, + sgp->dxfer_len); + sksgio->dxfer_len = sgp->dxfer_len; + } else + sksgio->dxfer_len = iov_data_len; + } + + if (sgp->dxfer_direction != SG_DXFER_NONE) { + struct sg_iovec *iov = sksgio->iov; + for (i = 0; i < sksgio->iovcnt; i++, iov++) { + if (!access_ok(acc, iov->iov_base, iov->iov_len)) { + pr_debug("%s:%s:%d access data failed %p/%d\n", + skdev->name, __func__, __LINE__, + iov->iov_base, (int)iov->iov_len); + return -EFAULT; + } + } + } + + return 0; +} + +static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = NULL; + int rc; + + for (;;) { + ulong flags; + + spin_lock_irqsave(&skdev->lock, flags); + skspcl = skdev->skspcl_free_list; + if (skspcl != NULL) { + skdev->skspcl_free_list = + (struct skd_special_context *)skspcl->req.next; + skspcl->req.id += SKD_ID_INCR; + skspcl->req.state = SKD_REQ_STATE_SETUP; + skspcl->orphaned = 0; + skspcl->req.n_sg = 0; + } + spin_unlock_irqrestore(&skdev->lock, flags); + + if (skspcl != NULL) { + rc = 0; + break; + } + + pr_debug("%s:%s:%d blocking\n", + skdev->name, __func__, __LINE__); + + rc = wait_event_interruptible_timeout( + skdev->waitq, + (skdev->skspcl_free_list != NULL), + msecs_to_jiffies(sksgio->sg.timeout)); + + pr_debug("%s:%s:%d unblocking, rc=%d\n", + skdev->name, __func__, __LINE__, rc); + + if (rc <= 0) { + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = -EINTR; + break; + } + /* + * If we get here rc > 0 meaning the timeout to + * wait_event_interruptible_timeout() had time left, hence the + * sought event -- non-empty free list -- happened. + * Retry the allocation. + */ + } + sksgio->skspcl = skspcl; + + return rc; +} + +static int skd_skreq_prep_buffering(struct skd_device *skdev, + struct skd_request_context *skreq, + u32 dxfer_len) +{ + u32 resid = dxfer_len; + + /* + * The DMA engine must have aligned addresses and byte counts. + */ + resid += (-resid) & 3; + skreq->sg_byte_count = resid; + + skreq->n_sg = 0; + + while (resid > 0) { + u32 nbytes = PAGE_SIZE; + u32 ix = skreq->n_sg; + struct scatterlist *sg = &skreq->sg[ix]; + struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; + struct page *page; + + if (nbytes > resid) + nbytes = resid; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + sg_set_page(sg, page, nbytes, 0); + + /* TODO: This should be going through a pci_???() + * routine to do proper mapping. */ + sksg->control = FIT_SGD_CONTROL_NOT_LAST; + sksg->byte_count = nbytes; + + sksg->host_side_addr = sg_phys(sg); + + sksg->dev_side_addr = 0; + sksg->next_desc_ptr = skreq->sksg_dma_address + + (ix + 1) * sizeof(*sksg); + + skreq->n_sg++; + resid -= nbytes; + } + + if (skreq->n_sg > 0) { + u32 ix = skreq->n_sg - 1; + struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; + + sksg->control = FIT_SGD_CONTROL_LAST; + sksg->next_desc_ptr = 0; + } + + if (unlikely(skdev->dbg_level > 1)) { + u32 i; + + pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + for (i = 0; i < skreq->n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + return 0; +} + +static int skd_sg_io_prep_buffering(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + struct skd_request_context *skreq = &skspcl->req; + u32 dxfer_len = sksgio->dxfer_len; + int rc; + + rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len); + /* + * Eventually, errors or not, skd_release_special() is called + * to recover allocations including partial allocations. + */ + return rc; +} + +static int skd_sg_io_copy_buffer(struct skd_device *skdev, + struct skd_sg_io *sksgio, int dxfer_dir) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + u32 iov_ix = 0; + struct sg_iovec curiov; + u32 sksg_ix = 0; + u8 *bufp = NULL; + u32 buf_len = 0; + u32 resid = sksgio->dxfer_len; + int rc; + + curiov.iov_len = 0; + curiov.iov_base = NULL; + + if (dxfer_dir != sksgio->sg.dxfer_direction) { + if (dxfer_dir != SG_DXFER_TO_DEV || + sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV) + return 0; + } + + while (resid > 0) { + u32 nbytes = PAGE_SIZE; + + if (curiov.iov_len == 0) { + curiov = sksgio->iov[iov_ix++]; + continue; + } + + if (buf_len == 0) { + struct page *page; + page = sg_page(&skspcl->req.sg[sksg_ix++]); + bufp = page_address(page); + buf_len = PAGE_SIZE; + } + + nbytes = min_t(u32, nbytes, resid); + nbytes = min_t(u32, nbytes, curiov.iov_len); + nbytes = min_t(u32, nbytes, buf_len); + + if (dxfer_dir == SG_DXFER_TO_DEV) + rc = __copy_from_user(bufp, curiov.iov_base, nbytes); + else + rc = __copy_to_user(curiov.iov_base, bufp, nbytes); + + if (rc) + return -EFAULT; + + resid -= nbytes; + curiov.iov_len -= nbytes; + curiov.iov_base += nbytes; + buf_len -= nbytes; + } + + return 0; +} + +static int skd_sg_io_send_fitmsg(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; + struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + + memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES); + + /* Initialize the FIT msg header */ + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + fmh->num_protocol_cmds_coalesced = 1; + + /* Initialize the SCSI request */ + if (sksgio->sg.dxfer_direction != SG_DXFER_NONE) + scsi_req->hdr.sg_list_dma_address = + cpu_to_be64(skspcl->req.sksg_dma_address); + scsi_req->hdr.tag = skspcl->req.id; + scsi_req->hdr.sg_list_len_bytes = + cpu_to_be32(skspcl->req.sg_byte_count); + memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb)); + + skspcl->req.state = SKD_REQ_STATE_BUSY; + skd_send_special_fitmsg(skdev, skspcl); + + return 0; +} + +static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) +{ + unsigned long flags; + int rc; + + rc = wait_event_interruptible_timeout(skdev->waitq, + (sksgio->skspcl->req.state != + SKD_REQ_STATE_BUSY), + msecs_to_jiffies(sksgio->sg. + timeout)); + + spin_lock_irqsave(&skdev->lock, flags); + + if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) { + pr_debug("%s:%s:%d skspcl %p aborted\n", + skdev->name, __func__, __LINE__, sksgio->skspcl); + + /* Build check cond, sense and let command finish. */ + /* For a timeout, we must fabricate completion and sense + * data to complete the command */ + sksgio->skspcl->req.completion.status = + SAM_STAT_CHECK_CONDITION; + + memset(&sksgio->skspcl->req.err_info, 0, + sizeof(sksgio->skspcl->req.err_info)); + sksgio->skspcl->req.err_info.type = 0x70; + sksgio->skspcl->req.err_info.key = ABORTED_COMMAND; + sksgio->skspcl->req.err_info.code = 0x44; + sksgio->skspcl->req.err_info.qual = 0; + rc = 0; + } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY) + /* No longer on the adapter. We finish. */ + rc = 0; + else { + /* Something's gone wrong. Still busy. Timeout or + * user interrupted (control-C). Mark as an orphan + * so it will be disposed when completed. */ + sksgio->skspcl->orphaned = 1; + sksgio->skspcl = NULL; + if (rc == 0) { + pr_debug("%s:%s:%d timed out %p (%u ms)\n", + skdev->name, __func__, __LINE__, + sksgio, sksgio->sg.timeout); + rc = -ETIMEDOUT; + } else { + pr_debug("%s:%s:%d cntlc %p\n", + skdev->name, __func__, __LINE__, sksgio); + rc = -EINTR; + } + } + + spin_unlock_irqrestore(&skdev->lock, flags); + + return rc; +} + +static int skd_sg_io_put_status(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct sg_io_hdr *sgp = &sksgio->sg; + struct skd_special_context *skspcl = sksgio->skspcl; + int resid = 0; + + u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes); + + sgp->status = skspcl->req.completion.status; + resid = sksgio->dxfer_len - nb; + + sgp->masked_status = sgp->status & STATUS_MASK; + sgp->msg_status = 0; + sgp->host_status = 0; + sgp->driver_status = 0; + sgp->resid = resid; + if (sgp->masked_status || sgp->host_status || sgp->driver_status) + sgp->info |= SG_INFO_CHECK; + + pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n", + skdev->name, __func__, __LINE__, + sgp->status, sgp->masked_status, sgp->resid); + + if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) { + if (sgp->mx_sb_len > 0) { + struct fit_comp_error_info *ei = &skspcl->req.err_info; + u32 nbytes = sizeof(*ei); + + nbytes = min_t(u32, nbytes, sgp->mx_sb_len); + + sgp->sb_len_wr = nbytes; + + if (__copy_to_user(sgp->sbp, ei, nbytes)) { + pr_debug("%s:%s:%d copy_to_user sense failed %p\n", + skdev->name, __func__, __LINE__, + sgp->sbp); + return -EFAULT; + } + } + } + + if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d copy_to_user sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + return 0; +} + +static int skd_sg_io_release_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + + if (skspcl != NULL) { + ulong flags; + + sksgio->skspcl = NULL; + + spin_lock_irqsave(&skdev->lock, flags); + skd_release_special(skdev, skspcl); + spin_unlock_irqrestore(&skdev->lock, flags); + } + + return 0; +} + +/* + ***************************************************************************** + * INTERNAL REQUESTS -- generated by driver itself + ***************************************************************************** + */ + +static int skd_format_internal_skspcl(struct skd_device *skdev) +{ + struct skd_special_context *skspcl = &skdev->internal_skspcl; + struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; + struct fit_msg_hdr *fmh; + uint64_t dma_address; + struct skd_scsi_request *scsi; + + fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0]; + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + fmh->num_protocol_cmds_coalesced = 1; + + scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + memset(scsi, 0, sizeof(*scsi)); + dma_address = skspcl->req.sksg_dma_address; + scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); + sgd->control = FIT_SGD_CONTROL_LAST; + sgd->byte_count = 0; + sgd->host_side_addr = skspcl->db_dma_address; + sgd->dev_side_addr = 0; + sgd->next_desc_ptr = 0LL; + + return 1; +} + +#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES + +static void skd_send_internal_skspcl(struct skd_device *skdev, + struct skd_special_context *skspcl, + u8 opcode) +{ + struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; + struct skd_scsi_request *scsi; + unsigned char *buf = skspcl->data_buf; + int i; + + if (skspcl->req.state != SKD_REQ_STATE_IDLE) + /* + * A refresh is already in progress. + * Just wait for it to finish. + */ + return; + + SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0); + skspcl->req.state = SKD_REQ_STATE_BUSY; + skspcl->req.id += SKD_ID_INCR; + + scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + scsi->hdr.tag = skspcl->req.id; + + memset(scsi->cdb, 0, sizeof(scsi->cdb)); + + switch (opcode) { + case TEST_UNIT_READY: + scsi->cdb[0] = TEST_UNIT_READY; + sgd->byte_count = 0; + scsi->hdr.sg_list_len_bytes = 0; + break; + + case READ_CAPACITY: + scsi->cdb[0] = READ_CAPACITY; + sgd->byte_count = SKD_N_READ_CAP_BYTES; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + break; + + case INQUIRY: + scsi->cdb[0] = INQUIRY; + scsi->cdb[1] = 0x01; /* evpd */ + scsi->cdb[2] = 0x80; /* serial number page */ + scsi->cdb[4] = 0x10; + sgd->byte_count = 16; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + break; + + case SYNCHRONIZE_CACHE: + scsi->cdb[0] = SYNCHRONIZE_CACHE; + sgd->byte_count = 0; + scsi->hdr.sg_list_len_bytes = 0; + break; + + case WRITE_BUFFER: + scsi->cdb[0] = WRITE_BUFFER; + scsi->cdb[1] = 0x02; + scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; + scsi->cdb[8] = WR_BUF_SIZE & 0xFF; + sgd->byte_count = WR_BUF_SIZE; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + /* fill incrementing byte pattern */ + for (i = 0; i < sgd->byte_count; i++) + buf[i] = i & 0xFF; + break; + + case READ_BUFFER: + scsi->cdb[0] = READ_BUFFER; + scsi->cdb[1] = 0x02; + scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; + scsi->cdb[8] = WR_BUF_SIZE & 0xFF; + sgd->byte_count = WR_BUF_SIZE; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + memset(skspcl->data_buf, 0, sgd->byte_count); + break; + + default: + SKD_ASSERT("Don't know what to send"); + return; + + } + skd_send_special_fitmsg(skdev, skspcl); +} + +static void skd_refresh_device_data(struct skd_device *skdev) +{ + struct skd_special_context *skspcl = &skdev->internal_skspcl; + + skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY); +} + +static int skd_chk_read_buf(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + unsigned char *buf = skspcl->data_buf; + int i; + + /* check for incrementing byte pattern */ + for (i = 0; i < WR_BUF_SIZE; i++) + if (buf[i] != (i & 0xFF)) + return 1; + + return 0; +} + +static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, + u8 code, u8 qual, u8 fruc) +{ + /* If the check condition is of special interest, log a message */ + if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02) + && (code == 0x04) && (qual == 0x06)) { + pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/" + "ascq/fruc %02x/%02x/%02x/%02x\n", + skd_name(skdev), key, code, qual, fruc); + } +} + +static void skd_complete_internal(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + u8 *buf = skspcl->data_buf; + u8 status; + int i; + struct skd_scsi_request *scsi = + (struct skd_scsi_request *)&skspcl->msg_buf[64]; + + SKD_ASSERT(skspcl == &skdev->internal_skspcl); + + pr_debug("%s:%s:%d complete internal %x\n", + skdev->name, __func__, __LINE__, scsi->cdb[0]); + + skspcl->req.completion = *skcomp; + skspcl->req.state = SKD_REQ_STATE_IDLE; + skspcl->req.id += SKD_ID_INCR; + + status = skspcl->req.completion.status; + + skd_log_check_status(skdev, status, skerr->key, skerr->code, + skerr->qual, skerr->fruc); + + switch (scsi->cdb[0]) { + case TEST_UNIT_READY: + if (status == SAM_STAT_GOOD) + skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); + else if ((status == SAM_STAT_CHECK_CONDITION) && + (skerr->key == MEDIUM_ERROR)) + skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); + else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d **** TUR failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case WRITE_BUFFER: + if (status == SAM_STAT_GOOD) + skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER); + else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case READ_BUFFER: + if (status == SAM_STAT_GOOD) { + if (skd_chk_read_buf(skdev, skspcl) == 0) + skd_send_internal_skspcl(skdev, skspcl, + READ_CAPACITY); + else { + pr_err( + "(%s):*** W/R Buffer mismatch %d ***\n", + skd_name(skdev), skdev->connect_retries); + if (skdev->connect_retries < + SKD_MAX_CONNECT_RETRIES) { + skdev->connect_retries++; + skd_soft_reset(skdev); + } else { + pr_err( + "(%s): W/R Buffer Connect Error\n", + skd_name(skdev)); + return; + } + } + + } else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d " + "read buffer failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d " + "**** read buffer failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case READ_CAPACITY: + skdev->read_cap_is_valid = 0; + if (status == SAM_STAT_GOOD) { + skdev->read_cap_last_lba = + (buf[0] << 24) | (buf[1] << 16) | + (buf[2] << 8) | buf[3]; + skdev->read_cap_blocksize = + (buf[4] << 24) | (buf[5] << 16) | + (buf[6] << 8) | buf[7]; + + pr_debug("%s:%s:%d last lba %d, bs %d\n", + skdev->name, __func__, __LINE__, + skdev->read_cap_last_lba, + skdev->read_cap_blocksize); + + set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); + + skdev->read_cap_is_valid = 1; + + skd_send_internal_skspcl(skdev, skspcl, INQUIRY); + } else if ((status == SAM_STAT_CHECK_CONDITION) && + (skerr->key == MEDIUM_ERROR)) { + skdev->read_cap_last_lba = ~0; + set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); + pr_debug("%s:%s:%d " + "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, INQUIRY); + } else { + pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, + TEST_UNIT_READY); + } + break; + + case INQUIRY: + skdev->inquiry_is_valid = 0; + if (status == SAM_STAT_GOOD) { + skdev->inquiry_is_valid = 1; + + for (i = 0; i < 12; i++) + skdev->inq_serial_num[i] = buf[i + 4]; + skdev->inq_serial_num[12] = 0; + } + + if (skd_unquiesce_dev(skdev) < 0) + pr_debug("%s:%s:%d **** failed, to ONLINE device\n", + skdev->name, __func__, __LINE__); + /* connection is complete */ + skdev->connect_retries = 0; + break; + + case SYNCHRONIZE_CACHE: + if (status == SAM_STAT_GOOD) + skdev->sync_done = 1; + else + skdev->sync_done = -1; + wake_up_interruptible(&skdev->waitq); + break; + + default: + SKD_ASSERT("we didn't send this"); + } +} + +/* + ***************************************************************************** + * FIT MESSAGES + ***************************************************************************** + */ + +static void skd_send_fitmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg) +{ + u64 qcmd; + struct fit_msg_hdr *fmh; + + pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n", + skdev->name, __func__, __LINE__, + skmsg->mb_dma_address, skdev->in_flight); + pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n", + skdev->name, __func__, __LINE__, + skmsg->msg_buf, skmsg->offset); + + qcmd = skmsg->mb_dma_address; + qcmd |= FIT_QCMD_QID_NORMAL; + + fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + skmsg->outstanding = fmh->num_protocol_cmds_coalesced; + + if (unlikely(skdev->dbg_level > 1)) { + u8 *bp = (u8 *)skmsg->msg_buf; + int i; + for (i = 0; i < skmsg->length; i += 8) { + pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x " + "%02x %02x %02x %02x\n", + skdev->name, __func__, __LINE__, + i, bp[i + 0], bp[i + 1], bp[i + 2], + bp[i + 3], bp[i + 4], bp[i + 5], + bp[i + 6], bp[i + 7]); + if (i == 0) + i = 64 - 8; + } + } + + if (skmsg->length > 256) + qcmd |= FIT_QCMD_MSGSIZE_512; + else if (skmsg->length > 128) + qcmd |= FIT_QCMD_MSGSIZE_256; + else if (skmsg->length > 64) + qcmd |= FIT_QCMD_MSGSIZE_128; + else + /* + * This makes no sense because the FIT msg header is + * 64 bytes. If the msg is only 64 bytes long it has + * no payload. + */ + qcmd |= FIT_QCMD_MSGSIZE_64; + + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); + +} + +static void skd_send_special_fitmsg(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + u64 qcmd; + + if (unlikely(skdev->dbg_level > 1)) { + u8 *bp = (u8 *)skspcl->msg_buf; + int i; + + for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) { + pr_debug("%s:%s:%d spcl[%2d] %02x %02x %02x %02x " + "%02x %02x %02x %02x\n", + skdev->name, __func__, __LINE__, i, + bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3], + bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]); + if (i == 0) + i = 64 - 8; + } + + pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skspcl, skspcl->req.id, skspcl->req.sksg_list, + skspcl->req.sksg_dma_address); + for (i = 0; i < skspcl->req.n_sg; i++) { + struct fit_sg_descriptor *sgd = + &skspcl->req.sksg_list[i]; + + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + /* + * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr + * and one 64-byte SSDI command. + */ + qcmd = skspcl->mb_dma_address; + qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); +} + +/* + ***************************************************************************** + * COMPLETION QUEUE + ***************************************************************************** + */ + +static void skd_complete_other(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr); + +struct sns_info { + u8 type; + u8 stat; + u8 key; + u8 asc; + u8 ascq; + u8 mask; + enum skd_check_status_action action; +}; + +static struct sns_info skd_chkstat_table[] = { + /* Good */ + { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c, + SKD_CHECK_STATUS_REPORT_GOOD }, + + /* Smart alerts */ + { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + + /* Retry (with limits) */ + { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + + /* Busy (or about to be) */ + { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */ + SKD_CHECK_STATUS_BUSY_IMMINENT }, +}; + +/* + * Look up status and sense data to decide how to handle the error + * from the device. + * mask says which fields must match e.g., mask=0x18 means check + * type and stat, ignore key, asc, ascq. + */ + +static enum skd_check_status_action +skd_check_status(struct skd_device *skdev, + u8 cmp_status, volatile struct fit_comp_error_info *skerr) +{ + int i, n; + + pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", + skd_name(skdev), skerr->key, skerr->code, skerr->qual, + skerr->fruc); + + pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", + skdev->name, __func__, __LINE__, skerr->type, cmp_status, + skerr->key, skerr->code, skerr->qual, skerr->fruc); + + /* Does the info match an entry in the good category? */ + n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]); + for (i = 0; i < n; i++) { + struct sns_info *sns = &skd_chkstat_table[i]; + + if (sns->mask & 0x10) + if (skerr->type != sns->type) + continue; + + if (sns->mask & 0x08) + if (cmp_status != sns->stat) + continue; + + if (sns->mask & 0x04) + if (skerr->key != sns->key) + continue; + + if (sns->mask & 0x02) + if (skerr->code != sns->asc) + continue; + + if (sns->mask & 0x01) + if (skerr->qual != sns->ascq) + continue; + + if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) { + pr_err("(%s): SMART Alert: sense key/asc/ascq " + "%02x/%02x/%02x\n", + skd_name(skdev), skerr->key, + skerr->code, skerr->qual); + } + return sns->action; + } + + /* No other match, so nonzero status means error, + * zero status means good + */ + if (cmp_status) { + pr_debug("%s:%s:%d status check: error\n", + skdev->name, __func__, __LINE__); + return SKD_CHECK_STATUS_REPORT_ERROR; + } + + pr_debug("%s:%s:%d status check good default\n", + skdev->name, __func__, __LINE__); + return SKD_CHECK_STATUS_REPORT_GOOD; +} + +static void skd_resolve_req_exception(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + u8 cmp_status = skreq->completion.status; + + switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { + case SKD_CHECK_STATUS_REPORT_GOOD: + case SKD_CHECK_STATUS_REPORT_SMART_ALERT: + skd_end_request(skdev, skreq, 0); + break; + + case SKD_CHECK_STATUS_BUSY_IMMINENT: + skd_log_skreq(skdev, skreq, "retry(busy)"); + blk_requeue_request(skdev->queue, skreq->req); + pr_info("(%s) drive BUSY imminent\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; + skdev->timer_countdown = SKD_TIMER_MINUTES(20); + skd_quiesce_dev(skdev); + break; + + case SKD_CHECK_STATUS_REQUEUE_REQUEST: + if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) { + skd_log_skreq(skdev, skreq, "retry"); + blk_requeue_request(skdev->queue, skreq->req); + break; + } + /* fall through to report error */ + + case SKD_CHECK_STATUS_REPORT_ERROR: + default: + skd_end_request(skdev, skreq, -EIO); + break; + } +} + +/* assume spinlock is already held */ +static void skd_release_skreq(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + u32 msg_slot; + struct skd_fitmsg_context *skmsg; + + u32 timo_slot; + + /* + * Reclaim the FIT msg buffer if this is + * the first of the requests it carried to + * be completed. The FIT msg buffer used to + * send this request cannot be reused until + * we are sure the s1120 card has copied + * it to its memory. The FIT msg might have + * contained several requests. As soon as + * any of them are completed we know that + * the entire FIT msg was transferred. + * Only the first completed request will + * match the FIT msg buffer id. The FIT + * msg buffer id is immediately updated. + * When subsequent requests complete the FIT + * msg buffer id won't match, so we know + * quite cheaply that it is already done. + */ + msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK; + SKD_ASSERT(msg_slot < skdev->num_fitmsg_context); + + skmsg = &skdev->skmsg_table[msg_slot]; + if (skmsg->id == skreq->fitmsg_id) { + SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY); + SKD_ASSERT(skmsg->outstanding > 0); + skmsg->outstanding--; + if (skmsg->outstanding == 0) { + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + skmsg->next = skdev->skmsg_free_list; + skdev->skmsg_free_list = skmsg; + } + } + + /* + * Decrease the number of active requests. + * Also decrements the count in the timeout slot. + */ + SKD_ASSERT(skdev->in_flight > 0); + skdev->in_flight -= 1; + + timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0); + skdev->timeout_slot[timo_slot] -= 1; + + /* + * Reset backpointer + */ + skreq->req = NULL; + + /* + * Reclaim the skd_request_context + */ + skreq->state = SKD_REQ_STATE_IDLE; + skreq->id += SKD_ID_INCR; + skreq->next = skdev->skreq_free_list; + skdev->skreq_free_list = skreq; +} + +#define DRIVER_INQ_EVPD_PAGE_CODE 0xDA + +static void skd_do_inq_page_00(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size; + + /* Caller requested "supported pages". The driver needs to insert + * its page. + */ + pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n", + skdev->name, __func__, __LINE__); + + /* If the device rejected the request because the CDB was + * improperly formed, then just leave. + */ + if (skcomp->status == SAM_STAT_CHECK_CONDITION && + skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24) + return; + + /* Get the amount of space the caller allocated */ + max_bytes = (cdb[3] << 8) | cdb[4]; + + /* Get the number of pages actually returned by the device */ + drive_pages = (buf[2] << 8) | buf[3]; + drive_bytes = drive_pages + 4; + new_size = drive_pages + 1; + + /* Supported pages must be in numerical order, so find where + * the driver page needs to be inserted into the list of + * pages returned by the device. + */ + for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) { + if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE) + return; /* Device using this page code. abort */ + else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE) + break; + } + + if (insert_pt < max_bytes) { + uint16_t u; + + /* Shift everything up one byte to make room. */ + for (u = new_size + 3; u > insert_pt; u--) + buf[u] = buf[u - 1]; + buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE; + + /* SCSI byte order increment of num_returned_bytes by 1 */ + skcomp->num_returned_bytes = + be32_to_cpu(skcomp->num_returned_bytes) + 1; + skcomp->num_returned_bytes = + be32_to_cpu(skcomp->num_returned_bytes); + } + + /* update page length field to reflect the driver's page too */ + buf[2] = (uint8_t)((new_size >> 8) & 0xFF); + buf[3] = (uint8_t)((new_size >> 0) & 0xFF); +} + +static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width) +{ + int pcie_reg; + u16 pci_bus_speed; + u8 pci_lanes; + + pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pcie_reg) { + u16 linksta; + pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta); + + pci_bus_speed = linksta & 0xF; + pci_lanes = (linksta & 0x3F0) >> 4; + } else { + *speed = STEC_LINK_UNKNOWN; + *width = 0xFF; + return; + } + + switch (pci_bus_speed) { + case 1: + *speed = STEC_LINK_2_5GTS; + break; + case 2: + *speed = STEC_LINK_5GTS; + break; + case 3: + *speed = STEC_LINK_8GTS; + break; + default: + *speed = STEC_LINK_UNKNOWN; + break; + } + + if (pci_lanes <= 0x20) + *width = pci_lanes; + else + *width = 0xFF; +} + +static void skd_do_inq_page_da(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + struct pci_dev *pdev = skdev->pdev; + unsigned max_bytes; + struct driver_inquiry_data inq; + u16 val; + + pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n", + skdev->name, __func__, __LINE__); + + memset(&inq, 0, sizeof(inq)); + + inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE; + + skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes); + inq.pcie_bus_number = cpu_to_be16(pdev->bus->number); + inq.pcie_device_number = PCI_SLOT(pdev->devfn); + inq.pcie_function_number = PCI_FUNC(pdev->devfn); + + pci_read_config_word(pdev, PCI_VENDOR_ID, &val); + inq.pcie_vendor_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_DEVICE_ID, &val); + inq.pcie_device_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val); + inq.pcie_subsystem_vendor_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val); + inq.pcie_subsystem_device_id = cpu_to_be16(val); + + /* Driver version, fixed lenth, padded with spaces on the right */ + inq.driver_version_length = sizeof(inq.driver_version); + memset(&inq.driver_version, ' ', sizeof(inq.driver_version)); + memcpy(inq.driver_version, DRV_VER_COMPL, + min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL))); + + inq.page_length = cpu_to_be16((sizeof(inq) - 4)); + + /* Clear the error set by the device */ + skcomp->status = SAM_STAT_GOOD; + memset((void *)skerr, 0, sizeof(*skerr)); + + /* copy response into output buffer */ + max_bytes = (cdb[3] << 8) | cdb[4]; + memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq))); + + skcomp->num_returned_bytes = + be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq))); +} + +static void skd_do_driver_inq(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + if (!buf) + return; + else if (cdb[0] != INQUIRY) + return; /* Not an INQUIRY */ + else if ((cdb[1] & 1) == 0) + return; /* EVPD not set */ + else if (cdb[2] == 0) + /* Need to add driver's page to supported pages list */ + skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf); + else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE) + /* Caller requested driver's page */ + skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf); +} + +static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg) +{ + if (!sg) + return NULL; + if (!sg_page(sg)) + return NULL; + return sg_virt(sg); +} + +static void skd_process_scsi_inq(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + uint8_t *buf; + struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; + struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + + dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, + skspcl->req.sg_data_dir); + buf = skd_sg_1st_page_ptr(skspcl->req.sg); + + if (buf) + skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf); +} + + +static int skd_isr_completion_posted(struct skd_device *skdev, + int limit, int *enqueued) +{ + volatile struct fit_completion_entry_v1 *skcmp = NULL; + volatile struct fit_comp_error_info *skerr; + u16 req_id; + u32 req_slot; + struct skd_request_context *skreq; + u16 cmp_cntxt = 0; + u8 cmp_status = 0; + u8 cmp_cycle = 0; + u32 cmp_bytes = 0; + int rc = 0; + int processed = 0; + + for (;; ) { + SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY); + + skcmp = &skdev->skcomp_table[skdev->skcomp_ix]; + cmp_cycle = skcmp->cycle; + cmp_cntxt = skcmp->tag; + cmp_status = skcmp->status; + cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes); + + skerr = &skdev->skerr_table[skdev->skcomp_ix]; + + pr_debug("%s:%s:%d " + "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d " + "busy=%d rbytes=0x%x proto=%d\n", + skdev->name, __func__, __LINE__, skdev->skcomp_cycle, + skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status, + skdev->in_flight, cmp_bytes, skdev->proto_ver); + + if (cmp_cycle != skdev->skcomp_cycle) { + pr_debug("%s:%s:%d end of completions\n", + skdev->name, __func__, __LINE__); + break; + } + /* + * Update the completion queue head index and possibly + * the completion cycle count. 8-bit wrap-around. + */ + skdev->skcomp_ix++; + if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) { + skdev->skcomp_ix = 0; + skdev->skcomp_cycle++; + } + + /* + * The command context is a unique 32-bit ID. The low order + * bits help locate the request. The request is usually a + * r/w request (see skd_start() above) or a special request. + */ + req_id = cmp_cntxt; + req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK; + + /* Is this other than a r/w request? */ + if (req_slot >= skdev->num_req_context) { + /* + * This is not a completion for a r/w request. + */ + skd_complete_other(skdev, skcmp, skerr); + continue; + } + + skreq = &skdev->skreq_table[req_slot]; + + /* + * Make sure the request ID for the slot matches. + */ + if (skreq->id != req_id) { + pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n", + skdev->name, __func__, __LINE__, + req_id, skreq->id); + { + u16 new_id = cmp_cntxt; + pr_err("(%s): Completion mismatch " + "comp_id=0x%04x skreq=0x%04x new=0x%04x\n", + skd_name(skdev), req_id, + skreq->id, new_id); + + continue; + } + } + + SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); + + if (skreq->state == SKD_REQ_STATE_ABORTED) { + pr_debug("%s:%s:%d reclaim req %p id=%04x\n", + skdev->name, __func__, __LINE__, + skreq, skreq->id); + /* a previously timed out command can + * now be cleaned up */ + skd_release_skreq(skdev, skreq); + continue; + } + + skreq->completion = *skcmp; + if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) { + skreq->err_info = *skerr; + skd_log_check_status(skdev, cmp_status, skerr->key, + skerr->code, skerr->qual, + skerr->fruc); + } + /* Release DMA resources for the request. */ + if (skreq->n_sg > 0) + skd_postop_sg_list(skdev, skreq); + + if (!skreq->req) { + pr_debug("%s:%s:%d NULL backptr skdreq %p, " + "req=0x%x req_id=0x%x\n", + skdev->name, __func__, __LINE__, + skreq, skreq->id, req_id); + } else { + /* + * Capture the outcome and post it back to the + * native request. + */ + if (likely(cmp_status == SAM_STAT_GOOD)) + skd_end_request(skdev, skreq, 0); + else + skd_resolve_req_exception(skdev, skreq); + } + + /* + * Release the skreq, its FIT msg (if one), timeout slot, + * and queue depth. + */ + skd_release_skreq(skdev, skreq); + + /* skd_isr_comp_limit equal zero means no limit */ + if (limit) { + if (++processed >= limit) { + rc = 1; + break; + } + } + } + + if ((skdev->state == SKD_DRVR_STATE_PAUSING) + && (skdev->in_flight) == 0) { + skdev->state = SKD_DRVR_STATE_PAUSED; + wake_up_interruptible(&skdev->waitq); + } + + return rc; +} + +static void skd_complete_other(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr) +{ + u32 req_id = 0; + u32 req_table; + u32 req_slot; + struct skd_special_context *skspcl; + + req_id = skcomp->tag; + req_table = req_id & SKD_ID_TABLE_MASK; + req_slot = req_id & SKD_ID_SLOT_MASK; + + pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n", + skdev->name, __func__, __LINE__, + req_table, req_id, req_slot); + + /* + * Based on the request id, determine how to dispatch this completion. + * This swich/case is finding the good cases and forwarding the + * completion entry. Errors are reported below the switch. + */ + switch (req_table) { + case SKD_ID_RW_REQUEST: + /* + * The caller, skd_completion_posted_isr() above, + * handles r/w requests. The only way we get here + * is if the req_slot is out of bounds. + */ + break; + + case SKD_ID_SPECIAL_REQUEST: + /* + * Make sure the req_slot is in bounds and that the id + * matches. + */ + if (req_slot < skdev->n_special) { + skspcl = &skdev->skspcl_table[req_slot]; + if (skspcl->req.id == req_id && + skspcl->req.state == SKD_REQ_STATE_BUSY) { + skd_complete_special(skdev, + skcomp, skerr, skspcl); + return; + } + } + break; + + case SKD_ID_INTERNAL: + if (req_slot == 0) { + skspcl = &skdev->internal_skspcl; + if (skspcl->req.id == req_id && + skspcl->req.state == SKD_REQ_STATE_BUSY) { + skd_complete_internal(skdev, + skcomp, skerr, skspcl); + return; + } + } + break; + + case SKD_ID_FIT_MSG: + /* + * These id's should never appear in a completion record. + */ + break; + + default: + /* + * These id's should never appear anywhere; + */ + break; + } + + /* + * If we get here it is a bad or stale id. + */ +} + +static void skd_complete_special(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + pr_debug("%s:%s:%d completing special request %p\n", + skdev->name, __func__, __LINE__, skspcl); + if (skspcl->orphaned) { + /* Discard orphaned request */ + /* ?: Can this release directly or does it need + * to use a worker? */ + pr_debug("%s:%s:%d release orphaned %p\n", + skdev->name, __func__, __LINE__, skspcl); + skd_release_special(skdev, skspcl); + return; + } + + skd_process_scsi_inq(skdev, skcomp, skerr, skspcl); + + skspcl->req.state = SKD_REQ_STATE_COMPLETED; + skspcl->req.completion = *skcomp; + skspcl->req.err_info = *skerr; + + skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key, + skerr->code, skerr->qual, skerr->fruc); + + wake_up_interruptible(&skdev->waitq); +} + +/* assume spinlock is already held */ +static void skd_release_special(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + int i, was_depleted; + + for (i = 0; i < skspcl->req.n_sg; i++) { + struct page *page = sg_page(&skspcl->req.sg[i]); + __free_page(page); + } + + was_depleted = (skdev->skspcl_free_list == NULL); + + skspcl->req.state = SKD_REQ_STATE_IDLE; + skspcl->req.id += SKD_ID_INCR; + skspcl->req.next = + (struct skd_request_context *)skdev->skspcl_free_list; + skdev->skspcl_free_list = (struct skd_special_context *)skspcl; + + if (was_depleted) { + pr_debug("%s:%s:%d skspcl was depleted\n", + skdev->name, __func__, __LINE__); + /* Free list was depleted. Their might be waiters. */ + wake_up_interruptible(&skdev->waitq); + } +} + +static void skd_reset_skcomp(struct skd_device *skdev) +{ + u32 nbytes; + struct fit_completion_entry_v1 *skcomp; + + nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; + nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; + + memset(skdev->skcomp_table, 0, nbytes); + + skdev->skcomp_ix = 0; + skdev->skcomp_cycle = 1; +} + +/* + ***************************************************************************** + * INTERRUPTS + ***************************************************************************** + */ +static void skd_completion_worker(struct work_struct *work) +{ + struct skd_device *skdev = + container_of(work, struct skd_device, completion_worker); + unsigned long flags; + int flush_enqueued = 0; + + spin_lock_irqsave(&skdev->lock, flags); + + /* + * pass in limit=0, which means no limit.. + * process everything in compq + */ + skd_isr_completion_posted(skdev, 0, &flush_enqueued); + skd_request_fn(skdev->queue); + + spin_unlock_irqrestore(&skdev->lock, flags); +} + +static void skd_isr_msg_from_dev(struct skd_device *skdev); + +irqreturn_t +static skd_isr(int irq, void *ptr) +{ + struct skd_device *skdev; + u32 intstat; + u32 ack; + int rc = 0; + int deferred = 0; + int flush_enqueued = 0; + + skdev = (struct skd_device *)ptr; + spin_lock(&skdev->lock); + + for (;; ) { + intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST); + + ack = FIT_INT_DEF_MASK; + ack &= intstat; + + pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n", + skdev->name, __func__, __LINE__, intstat, ack); + + /* As long as there is an int pending on device, keep + * running loop. When none, get out, but if we've never + * done any processing, call completion handler? + */ + if (ack == 0) { + /* No interrupts on device, but run the completion + * processor anyway? + */ + if (rc == 0) + if (likely (skdev->state + == SKD_DRVR_STATE_ONLINE)) + deferred = 1; + break; + } + + rc = IRQ_HANDLED; + + SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST); + + if (likely((skdev->state != SKD_DRVR_STATE_LOAD) && + (skdev->state != SKD_DRVR_STATE_STOPPING))) { + if (intstat & FIT_ISH_COMPLETION_POSTED) { + /* + * If we have already deferred completion + * processing, don't bother running it again + */ + if (deferred == 0) + deferred = + skd_isr_completion_posted(skdev, + skd_isr_comp_limit, &flush_enqueued); + } + + if (intstat & FIT_ISH_FW_STATE_CHANGE) { + skd_isr_fwstate(skdev); + if (skdev->state == SKD_DRVR_STATE_FAULT || + skdev->state == + SKD_DRVR_STATE_DISAPPEARED) { + spin_unlock(&skdev->lock); + return rc; + } + } + + if (intstat & FIT_ISH_MSG_FROM_DEV) + skd_isr_msg_from_dev(skdev); + } + } + + if (unlikely(flush_enqueued)) + skd_request_fn(skdev->queue); + + if (deferred) + schedule_work(&skdev->completion_worker); + else if (!flush_enqueued) + skd_request_fn(skdev->queue); + + spin_unlock(&skdev->lock); + + return rc; +} + +static void skd_drive_fault(struct skd_device *skdev) +{ + skdev->state = SKD_DRVR_STATE_FAULT; + pr_err("(%s): Drive FAULT\n", skd_name(skdev)); +} + +static void skd_drive_disappeared(struct skd_device *skdev) +{ + skdev->state = SKD_DRVR_STATE_DISAPPEARED; + pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev)); +} + +static void skd_isr_fwstate(struct skd_device *skdev) +{ + u32 sense; + u32 state; + u32 mtd; + int prev_driver_state = skdev->state; + + sense = SKD_READL(skdev, FIT_STATUS); + state = sense & FIT_SR_DRIVE_STATE_MASK; + + pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_drive_state_to_str(state), state); + + skdev->drive_state = state; + + switch (skdev->drive_state) { + case FIT_SR_DRIVE_INIT: + if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) { + skd_disable_interrupts(skdev); + break; + } + if (skdev->state == SKD_DRVR_STATE_RESTARTING) + skd_recover_requests(skdev, 0); + if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) { + skdev->timer_countdown = SKD_STARTING_TIMO; + skdev->state = SKD_DRVR_STATE_STARTING; + skd_soft_reset(skdev); + break; + } + mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_SR_DRIVE_ONLINE: + skdev->cur_max_queue_depth = skd_max_queue_depth; + if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth) + skdev->cur_max_queue_depth = skdev->dev_max_queue_depth; + + skdev->queue_low_water_mark = + skdev->cur_max_queue_depth * 2 / 3 + 1; + if (skdev->queue_low_water_mark < 1) + skdev->queue_low_water_mark = 1; + pr_info( + "(%s): Queue depth limit=%d dev=%d lowat=%d\n", + skd_name(skdev), + skdev->cur_max_queue_depth, + skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + + skd_refresh_device_data(skdev); + break; + + case FIT_SR_DRIVE_BUSY: + skdev->state = SKD_DRVR_STATE_BUSY; + skdev->timer_countdown = SKD_BUSY_TIMO; + skd_quiesce_dev(skdev); + break; + case FIT_SR_DRIVE_BUSY_SANITIZE: + /* set timer for 3 seconds, we'll abort any unfinished + * commands after that expires + */ + skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; + skdev->timer_countdown = SKD_TIMER_SECONDS(3); + blk_start_queue(skdev->queue); + break; + case FIT_SR_DRIVE_BUSY_ERASE: + skdev->state = SKD_DRVR_STATE_BUSY_ERASE; + skdev->timer_countdown = SKD_BUSY_TIMO; + break; + case FIT_SR_DRIVE_OFFLINE: + skdev->state = SKD_DRVR_STATE_IDLE; + break; + case FIT_SR_DRIVE_SOFT_RESET: + switch (skdev->state) { + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + /* Expected by a caller of skd_soft_reset() */ + break; + default: + skdev->state = SKD_DRVR_STATE_RESTARTING; + break; + } + break; + case FIT_SR_DRIVE_FW_BOOTING: + pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n", + skdev->name, __func__, __LINE__, skdev->name); + skdev->state = SKD_DRVR_STATE_WAIT_BOOT; + skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; + break; + + case FIT_SR_DRIVE_DEGRADED: + case FIT_SR_PCIE_LINK_DOWN: + case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: + break; + + case FIT_SR_DRIVE_FAULT: + skd_drive_fault(skdev); + skd_recover_requests(skdev, 0); + blk_start_queue(skdev->queue); + break; + + /* PCIe bus returned all Fs? */ + case 0xFF: + pr_info("(%s): state=0x%x sense=0x%x\n", + skd_name(skdev), state, sense); + skd_drive_disappeared(skdev); + skd_recover_requests(skdev, 0); + blk_start_queue(skdev->queue); + break; + default: + /* + * Uknown FW State. Wait for a state we recognize. + */ + break; + } + pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_skdev_state_to_str(prev_driver_state), prev_driver_state, + skd_skdev_state_to_str(skdev->state), skdev->state); +} + +static void skd_recover_requests(struct skd_device *skdev, int requeue) +{ + int i; + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq = &skdev->skreq_table[i]; + + if (skreq->state == SKD_REQ_STATE_BUSY) { + skd_log_skreq(skdev, skreq, "recover"); + + SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0); + SKD_ASSERT(skreq->req != NULL); + + /* Release DMA resources for the request. */ + if (skreq->n_sg > 0) + skd_postop_sg_list(skdev, skreq); + + if (requeue && + (unsigned long) ++skreq->req->special < + SKD_MAX_RETRIES) + blk_requeue_request(skdev->queue, skreq->req); + else + skd_end_request(skdev, skreq, -EIO); + + skreq->req = NULL; + + skreq->state = SKD_REQ_STATE_IDLE; + skreq->id += SKD_ID_INCR; + } + if (i > 0) + skreq[-1].next = skreq; + skreq->next = NULL; + } + skdev->skreq_free_list = skdev->skreq_table; + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i]; + + if (skmsg->state == SKD_MSG_STATE_BUSY) { + skd_log_skmsg(skdev, skmsg, "salvaged"); + SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0); + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + } + if (i > 0) + skmsg[-1].next = skmsg; + skmsg->next = NULL; + } + skdev->skmsg_free_list = skdev->skmsg_table; + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl = &skdev->skspcl_table[i]; + + /* If orphaned, reclaim it because it has already been reported + * to the process as an error (it was just waiting for + * a completion that didn't come, and now it will never come) + * If busy, change to a state that will cause it to error + * out in the wait routine and let it do the normal + * reporting and reclaiming + */ + if (skspcl->req.state == SKD_REQ_STATE_BUSY) { + if (skspcl->orphaned) { + pr_debug("%s:%s:%d orphaned %p\n", + skdev->name, __func__, __LINE__, + skspcl); + skd_release_special(skdev, skspcl); + } else { + pr_debug("%s:%s:%d not orphaned %p\n", + skdev->name, __func__, __LINE__, + skspcl); + skspcl->req.state = SKD_REQ_STATE_ABORTED; + } + } + } + skdev->skspcl_free_list = skdev->skspcl_table; + + for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) + skdev->timeout_slot[i] = 0; + + skdev->in_flight = 0; +} + +static void skd_isr_msg_from_dev(struct skd_device *skdev) +{ + u32 mfd; + u32 mtd; + u32 data; + + mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); + + pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n", + skdev->name, __func__, __LINE__, mfd, skdev->last_mtd); + + /* ignore any mtd that is an ack for something we didn't send */ + if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd)) + return; + + switch (FIT_MXD_TYPE(mfd)) { + case FIT_MTD_FITFW_INIT: + skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd); + + if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) { + pr_err("(%s): protocol mismatch\n", + skdev->name); + pr_err("(%s): got=%d support=%d\n", + skdev->name, skdev->proto_ver, + FIT_PROTOCOL_VERSION_1); + pr_err("(%s): please upgrade driver\n", + skdev->name); + skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH; + skd_soft_reset(skdev); + break; + } + mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_GET_CMDQ_DEPTH: + skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd); + mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0, + SKD_N_COMPLETION_ENTRY); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_SET_COMPQ_DEPTH: + SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG); + mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_SET_COMPQ_ADDR: + skd_reset_skcomp(skdev); + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_HOST_ID: + skdev->connect_time_stamp = get_seconds(); + data = skdev->connect_time_stamp & 0xFFFF; + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_TIME_STAMP_LO: + skdev->drive_jiffies = FIT_MXD_DATA(mfd); + data = (skdev->connect_time_stamp >> 16) & 0xFFFF; + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_TIME_STAMP_HI: + skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16); + mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + + pr_err("(%s): Time sync driver=0x%x device=0x%x\n", + skd_name(skdev), + skdev->connect_time_stamp, skdev->drive_jiffies); + break; + + case FIT_MTD_ARM_QUEUE: + skdev->last_mtd = 0; + /* + * State should be, or soon will be, FIT_SR_DRIVE_ONLINE. + */ + break; + + default: + break; + } +} + +static void skd_disable_interrupts(struct skd_device *skdev) +{ + u32 sense; + + sense = SKD_READL(skdev, FIT_CONTROL); + sense &= ~FIT_CR_ENABLE_INTERRUPTS; + SKD_WRITEL(skdev, sense, FIT_CONTROL); + pr_debug("%s:%s:%d sense 0x%x\n", + skdev->name, __func__, __LINE__, sense); + + /* Note that the 1s is written. A 1-bit means + * disable, a 0 means enable. + */ + SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST); +} + +static void skd_enable_interrupts(struct skd_device *skdev) +{ + u32 val; + + /* unmask interrupts first */ + val = FIT_ISH_FW_STATE_CHANGE + + FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV; + + /* Note that the compliment of mask is written. A 1-bit means + * disable, a 0 means enable. */ + SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST); + pr_debug("%s:%s:%d interrupt mask=0x%x\n", + skdev->name, __func__, __LINE__, ~val); + + val = SKD_READL(skdev, FIT_CONTROL); + val |= FIT_CR_ENABLE_INTERRUPTS; + pr_debug("%s:%s:%d control=0x%x\n", + skdev->name, __func__, __LINE__, val); + SKD_WRITEL(skdev, val, FIT_CONTROL); +} + +/* + ***************************************************************************** + * START, STOP, RESTART, QUIESCE, UNQUIESCE + ***************************************************************************** + */ + +static void skd_soft_reset(struct skd_device *skdev) +{ + u32 val; + + val = SKD_READL(skdev, FIT_CONTROL); + val |= (FIT_CR_SOFT_RESET); + pr_debug("%s:%s:%d control=0x%x\n", + skdev->name, __func__, __LINE__, val); + SKD_WRITEL(skdev, val, FIT_CONTROL); +} + +static void skd_start_device(struct skd_device *skdev) +{ + unsigned long flags; + u32 sense; + u32 state; + + spin_lock_irqsave(&skdev->lock, flags); + + /* ack all ghost interrupts */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + + sense = SKD_READL(skdev, FIT_STATUS); + + pr_debug("%s:%s:%d initial status=0x%x\n", + skdev->name, __func__, __LINE__, sense); + + state = sense & FIT_SR_DRIVE_STATE_MASK; + skdev->drive_state = state; + skdev->last_mtd = 0; + + skdev->state = SKD_DRVR_STATE_STARTING; + skdev->timer_countdown = SKD_STARTING_TIMO; + + skd_enable_interrupts(skdev); + + switch (skdev->drive_state) { + case FIT_SR_DRIVE_OFFLINE: + pr_err("(%s): Drive offline...\n", skd_name(skdev)); + break; + + case FIT_SR_DRIVE_FW_BOOTING: + pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n", + skdev->name, __func__, __LINE__, skdev->name); + skdev->state = SKD_DRVR_STATE_WAIT_BOOT; + skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; + break; + + case FIT_SR_DRIVE_BUSY_SANITIZE: + pr_info("(%s): Start: BUSY_SANITIZE\n", + skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_BUSY_ERASE: + pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_ERASE; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_INIT: + case FIT_SR_DRIVE_ONLINE: + skd_soft_reset(skdev); + break; + + case FIT_SR_DRIVE_BUSY: + pr_err("(%s): Drive Busy...\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_SOFT_RESET: + pr_err("(%s) drive soft reset in prog\n", + skd_name(skdev)); + break; + + case FIT_SR_DRIVE_FAULT: + /* Fault state is bad...soft reset won't do it... + * Hard reset, maybe, but does it work on device? + * For now, just fault so the system doesn't hang. + */ + skd_drive_fault(skdev); + /*start the queue so we can respond with error to requests */ + pr_debug("%s:%s:%d starting %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case 0xFF: + /* Most likely the device isn't there or isn't responding + * to the BAR1 addresses. */ + skd_drive_disappeared(skdev); + /*start the queue so we can respond with error to requests */ + pr_debug("%s:%s:%d starting %s queue to error-out reqs\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + default: + pr_err("(%s) Start: unknown state %x\n", + skd_name(skdev), skdev->drive_state); + break; + } + + state = SKD_READL(skdev, FIT_CONTROL); + pr_debug("%s:%s:%d FIT Control Status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_INT_STATUS_HOST); + pr_debug("%s:%s:%d Intr Status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_INT_MASK_HOST); + pr_debug("%s:%s:%d Intr Mask=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); + pr_debug("%s:%s:%d Msg from Dev=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_HW_VERSION); + pr_debug("%s:%s:%d HW version=0x%x\n", + skdev->name, __func__, __LINE__, state); + + spin_unlock_irqrestore(&skdev->lock, flags); +} + +static void skd_stop_device(struct skd_device *skdev) +{ + unsigned long flags; + struct skd_special_context *skspcl = &skdev->internal_skspcl; + u32 dev_state; + int i; + + spin_lock_irqsave(&skdev->lock, flags); + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + pr_err("(%s): skd_stop_device not online no sync\n", + skd_name(skdev)); + goto stop_out; + } + + if (skspcl->req.state != SKD_REQ_STATE_IDLE) { + pr_err("(%s): skd_stop_device no special\n", + skd_name(skdev)); + goto stop_out; + } + + skdev->state = SKD_DRVR_STATE_SYNCING; + skdev->sync_done = 0; + + skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE); + + spin_unlock_irqrestore(&skdev->lock, flags); + + wait_event_interruptible_timeout(skdev->waitq, + (skdev->sync_done), (10 * HZ)); + + spin_lock_irqsave(&skdev->lock, flags); + + switch (skdev->sync_done) { + case 0: + pr_err("(%s): skd_stop_device no sync\n", + skd_name(skdev)); + break; + case 1: + pr_err("(%s): skd_stop_device sync done\n", + skd_name(skdev)); + break; + default: + pr_err("(%s): skd_stop_device sync error\n", + skd_name(skdev)); + } + +stop_out: + skdev->state = SKD_DRVR_STATE_STOPPING; + spin_unlock_irqrestore(&skdev->lock, flags); + + skd_kill_timer(skdev); + + spin_lock_irqsave(&skdev->lock, flags); + skd_disable_interrupts(skdev); + + /* ensure all ints on device are cleared */ + /* soft reset the device to unload with a clean slate */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL); + + spin_unlock_irqrestore(&skdev->lock, flags); + + /* poll every 100ms, 1 second timeout */ + for (i = 0; i < 10; i++) { + dev_state = + SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK; + if (dev_state == FIT_SR_DRIVE_INIT) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(100)); + } + + if (dev_state != FIT_SR_DRIVE_INIT) + pr_err("(%s): skd_stop_device state error 0x%02x\n", + skd_name(skdev), dev_state); +} + +/* assume spinlock is held */ +static void skd_restart_device(struct skd_device *skdev) +{ + u32 state; + + /* ack all ghost interrupts */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + + state = SKD_READL(skdev, FIT_STATUS); + + pr_debug("%s:%s:%d drive status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state &= FIT_SR_DRIVE_STATE_MASK; + skdev->drive_state = state; + skdev->last_mtd = 0; + + skdev->state = SKD_DRVR_STATE_RESTARTING; + skdev->timer_countdown = SKD_RESTARTING_TIMO; + + skd_soft_reset(skdev); +} + +/* assume spinlock is held */ +static int skd_quiesce_dev(struct skd_device *skdev) +{ + int rc = 0; + + switch (skdev->state) { + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + pr_debug("%s:%s:%d stopping %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_stop_queue(skdev->queue); + break; + case SKD_DRVR_STATE_ONLINE: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_RESUMING: + default: + rc = -EINVAL; + pr_debug("%s:%s:%d state [%d] not implemented\n", + skdev->name, __func__, __LINE__, skdev->state); + } + return rc; +} + +/* assume spinlock is held */ +static int skd_unquiesce_dev(struct skd_device *skdev) +{ + int prev_driver_state = skdev->state; + + skd_log_skdev(skdev, "unquiesce"); + if (skdev->state == SKD_DRVR_STATE_ONLINE) { + pr_debug("%s:%s:%d **** device already ONLINE\n", + skdev->name, __func__, __LINE__); + return 0; + } + if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) { + /* + * If there has been an state change to other than + * ONLINE, we will rely on controller state change + * to come back online and restart the queue. + * The BUSY state means that driver is ready to + * continue normal processing but waiting for controller + * to become available. + */ + skdev->state = SKD_DRVR_STATE_BUSY; + pr_debug("%s:%s:%d drive BUSY state\n", + skdev->name, __func__, __LINE__); + return 0; + } + + /* + * Drive has just come online, driver is either in startup, + * paused performing a task, or bust waiting for hardware. + */ + switch (skdev->state) { + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_IDLE: + case SKD_DRVR_STATE_LOAD: + skdev->state = SKD_DRVR_STATE_ONLINE; + pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_skdev_state_to_str(prev_driver_state), + prev_driver_state, skd_skdev_state_to_str(skdev->state), + skdev->state); + pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n", + skdev->name, __func__, __LINE__); + pr_debug("%s:%s:%d starting %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev)); + blk_start_queue(skdev->queue); + skdev->gendisk_on = 1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_DISAPPEARED: + default: + pr_debug("%s:%s:%d **** driver state %d, not implemented \n", + skdev->name, __func__, __LINE__, + skdev->state); + return -EBUSY; + } + return 0; +} + +/* + ***************************************************************************** + * PCIe MSI/MSI-X INTERRUPT HANDLERS + ***************************************************************************** + */ + +static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev), + irq, SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_statec_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST); + skd_isr_fwstate(skdev); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_comp_q(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + int flush_enqueued = 0; + int deferred; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST); + deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, + &flush_enqueued); + if (flush_enqueued) + skd_request_fn(skdev->queue); + + if (deferred) + schedule_work(&skdev->completion_worker); + else if (!flush_enqueued) + skd_request_fn(skdev->queue); + + spin_unlock_irqrestore(&skdev->lock, flags); + + return IRQ_HANDLED; +} + +static irqreturn_t skd_msg_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST); + skd_isr_msg_from_dev(skdev); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +/* + ***************************************************************************** + * PCIe MSI/MSI-X SETUP + ***************************************************************************** + */ + +struct skd_msix_entry { + int have_irq; + u32 vector; + u32 entry; + struct skd_device *rsp; + char isr_name[30]; +}; + +struct skd_init_msix_entry { + const char *name; + irq_handler_t handler; +}; + +#define SKD_MAX_MSIX_COUNT 13 +#define SKD_MIN_MSIX_COUNT 7 +#define SKD_BASE_MSIX_IRQ 4 + +static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = { + { "(DMA 0)", skd_reserved_isr }, + { "(DMA 1)", skd_reserved_isr }, + { "(DMA 2)", skd_reserved_isr }, + { "(DMA 3)", skd_reserved_isr }, + { "(State Change)", skd_statec_isr }, + { "(COMPL_Q)", skd_comp_q }, + { "(MSG)", skd_msg_isr }, + { "(Reserved)", skd_reserved_isr }, + { "(Reserved)", skd_reserved_isr }, + { "(Queue Full 0)", skd_qfull_isr }, + { "(Queue Full 1)", skd_qfull_isr }, + { "(Queue Full 2)", skd_qfull_isr }, + { "(Queue Full 3)", skd_qfull_isr }, +}; + +static void skd_release_msix(struct skd_device *skdev) +{ + struct skd_msix_entry *qentry; + int i; + + if (skdev->msix_entries == NULL) + return; + for (i = 0; i < skdev->msix_count; i++) { + qentry = &skdev->msix_entries[i]; + skdev = qentry->rsp; + + if (qentry->have_irq) + devm_free_irq(&skdev->pdev->dev, + qentry->vector, qentry->rsp); + } + pci_disable_msix(skdev->pdev); + kfree(skdev->msix_entries); + skdev->msix_count = 0; + skdev->msix_entries = NULL; +} + +static int skd_acquire_msix(struct skd_device *skdev) +{ + int i, rc; + struct pci_dev *pdev; + struct msix_entry *entries = NULL; + struct skd_msix_entry *qentry; + + pdev = skdev->pdev; + skdev->msix_count = SKD_MAX_MSIX_COUNT; + entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT, + GFP_KERNEL); + if (!entries) + return -ENOMEM; + + for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) + entries[i].entry = i; + + rc = pci_enable_msix(pdev, entries, SKD_MAX_MSIX_COUNT); + if (rc < 0) + goto msix_out; + if (rc) { + if (rc < SKD_MIN_MSIX_COUNT) { + pr_err("(%s): failed to enable MSI-X %d\n", + skd_name(skdev), rc); + goto msix_out; + } + pr_debug("%s:%s:%d %s: <%s> allocated %d MSI-X vectors\n", + skdev->name, __func__, __LINE__, + pci_name(pdev), skdev->name, rc); + + skdev->msix_count = rc; + rc = pci_enable_msix(pdev, entries, skdev->msix_count); + if (rc) { + pr_err("(%s): failed to enable MSI-X " + "support (%d) %d\n", + skd_name(skdev), skdev->msix_count, rc); + goto msix_out; + } + } + skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * + skdev->msix_count, GFP_KERNEL); + if (!skdev->msix_entries) { + rc = -ENOMEM; + skdev->msix_count = 0; + pr_err("(%s): msix table allocation error\n", + skd_name(skdev)); + goto msix_out; + } + + qentry = skdev->msix_entries; + for (i = 0; i < skdev->msix_count; i++) { + qentry->vector = entries[i].vector; + qentry->entry = entries[i].entry; + qentry->rsp = NULL; + qentry->have_irq = 0; + pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n", + skdev->name, __func__, __LINE__, + pci_name(pdev), skdev->name, + i, qentry->vector, qentry->entry); + qentry++; + } + + /* Enable MSI-X vectors for the base queue */ + for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) { + qentry = &skdev->msix_entries[i]; + snprintf(qentry->isr_name, sizeof(qentry->isr_name), + "%s%d-msix %s", DRV_NAME, skdev->devno, + msix_entries[i].name); + rc = devm_request_irq(&skdev->pdev->dev, qentry->vector, + msix_entries[i].handler, 0, + qentry->isr_name, skdev); + if (rc) { + pr_err("(%s): Unable to register(%d) MSI-X " + "handler %d: %s\n", + skd_name(skdev), rc, i, qentry->isr_name); + goto msix_out; + } else { + qentry->have_irq = 1; + qentry->rsp = skdev; + } + } + pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n", + skdev->name, __func__, __LINE__, + pci_name(pdev), skdev->name, skdev->msix_count); + return 0; + +msix_out: + if (entries) + kfree(entries); + skd_release_msix(skdev); + return rc; +} + +static int skd_acquire_irq(struct skd_device *skdev) +{ + int rc; + struct pci_dev *pdev; + + pdev = skdev->pdev; + skdev->msix_count = 0; + +RETRY_IRQ_TYPE: + switch (skdev->irq_type) { + case SKD_IRQ_MSIX: + rc = skd_acquire_msix(skdev); + if (!rc) + pr_info("(%s): MSI-X %d irqs enabled\n", + skd_name(skdev), skdev->msix_count); + else { + pr_err( + "(%s): failed to enable MSI-X, re-trying with MSI %d\n", + skd_name(skdev), rc); + skdev->irq_type = SKD_IRQ_MSI; + goto RETRY_IRQ_TYPE; + } + break; + case SKD_IRQ_MSI: + snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi", + DRV_NAME, skdev->devno); + rc = pci_enable_msi(pdev); + if (!rc) { + rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0, + skdev->isr_name, skdev); + if (rc) { + pci_disable_msi(pdev); + pr_err( + "(%s): failed to allocate the MSI interrupt %d\n", + skd_name(skdev), rc); + goto RETRY_IRQ_LEGACY; + } + pr_info("(%s): MSI irq %d enabled\n", + skd_name(skdev), pdev->irq); + } else { +RETRY_IRQ_LEGACY: + pr_err( + "(%s): failed to enable MSI, re-trying with LEGACY %d\n", + skd_name(skdev), rc); + skdev->irq_type = SKD_IRQ_LEGACY; + goto RETRY_IRQ_TYPE; + } + break; + case SKD_IRQ_LEGACY: + snprintf(skdev->isr_name, sizeof(skdev->isr_name), + "%s%d-legacy", DRV_NAME, skdev->devno); + rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, + IRQF_SHARED, skdev->isr_name, skdev); + if (!rc) + pr_info("(%s): LEGACY irq %d enabled\n", + skd_name(skdev), pdev->irq); + else + pr_err("(%s): request LEGACY irq error %d\n", + skd_name(skdev), rc); + break; + default: + pr_info("(%s): irq_type %d invalid, re-set to %d\n", + skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT); + skdev->irq_type = SKD_IRQ_LEGACY; + goto RETRY_IRQ_TYPE; + } + return rc; +} + +static void skd_release_irq(struct skd_device *skdev) +{ + switch (skdev->irq_type) { + case SKD_IRQ_MSIX: + skd_release_msix(skdev); + break; + case SKD_IRQ_MSI: + devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); + pci_disable_msi(skdev->pdev); + break; + case SKD_IRQ_LEGACY: + devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); + break; + default: + pr_err("(%s): wrong irq type %d!", + skd_name(skdev), skdev->irq_type); + break; + } +} + +/* + ***************************************************************************** + * CONSTRUCT + ***************************************************************************** + */ + +static int skd_cons_skcomp(struct skd_device *skdev) +{ + int rc = 0; + struct fit_completion_entry_v1 *skcomp; + u32 nbytes; + + nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; + nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; + + pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n", + skdev->name, __func__, __LINE__, + nbytes, SKD_N_COMPLETION_ENTRY); + + skcomp = pci_alloc_consistent(skdev->pdev, nbytes, + &skdev->cq_dma_address); + + if (skcomp == NULL) { + rc = -ENOMEM; + goto err_out; + } + + memset(skcomp, 0, nbytes); + + skdev->skcomp_table = skcomp; + skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp + + sizeof(*skcomp) * + SKD_N_COMPLETION_ENTRY); + +err_out: + return rc; +} + +static int skd_cons_skmsg(struct skd_device *skdev) +{ + int rc = 0; + u32 i; + + pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_fitmsg_context), + skdev->num_fitmsg_context, + sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); + + skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context) + *skdev->num_fitmsg_context, GFP_KERNEL); + if (skdev->skmsg_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg; + + skmsg = &skdev->skmsg_table[i]; + + skmsg->id = i + SKD_ID_FIT_MSG; + + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->msg_buf = pci_alloc_consistent(skdev->pdev, + SKD_N_FITMSG_BYTES + 64, + &skmsg->mb_dma_address); + + if (skmsg->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skmsg->offset = (u32)((u64)skmsg->msg_buf & + (~FIT_QCMD_BASE_ADDRESS_MASK)); + skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK; + skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf & + FIT_QCMD_BASE_ADDRESS_MASK); + skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK; + skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK; + memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); + + skmsg->next = &skmsg[1]; + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skmsg_table[i - 1].next = NULL; + skdev->skmsg_free_list = skdev->skmsg_table; + +err_out: + return rc; +} + +static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, + u32 n_sg, + dma_addr_t *ret_dma_addr) +{ + struct fit_sg_descriptor *sg_list; + u32 nbytes; + + nbytes = sizeof(*sg_list) * n_sg; + + sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr); + + if (sg_list != NULL) { + uint64_t dma_address = *ret_dma_addr; + u32 i; + + memset(sg_list, 0, nbytes); + + for (i = 0; i < n_sg - 1; i++) { + uint64_t ndp_off; + ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor); + + sg_list[i].next_desc_ptr = dma_address + ndp_off; + } + sg_list[i].next_desc_ptr = 0LL; + } + + return sg_list; +} + +static int skd_cons_skreq(struct skd_device *skdev) +{ + int rc = 0; + u32 i; + + pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_request_context), + skdev->num_req_context, + sizeof(struct skd_request_context) * skdev->num_req_context); + + skdev->skreq_table = kzalloc(sizeof(struct skd_request_context) + * skdev->num_req_context, GFP_KERNEL); + if (skdev->skreq_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n", + skdev->name, __func__, __LINE__, + skdev->sgs_per_request, sizeof(struct scatterlist), + skdev->sgs_per_request * sizeof(struct scatterlist)); + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq; + + skreq = &skdev->skreq_table[i]; + + skreq->id = i + SKD_ID_RW_REQUEST; + skreq->state = SKD_REQ_STATE_IDLE; + + skreq->sg = kzalloc(sizeof(struct scatterlist) * + skdev->sgs_per_request, GFP_KERNEL); + if (skreq->sg == NULL) { + rc = -ENOMEM; + goto err_out; + } + sg_init_table(skreq->sg, skdev->sgs_per_request); + + skreq->sksg_list = skd_cons_sg_list(skdev, + skdev->sgs_per_request, + &skreq->sksg_dma_address); + + if (skreq->sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skreq->next = &skreq[1]; + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skreq_table[i - 1].next = NULL; + skdev->skreq_free_list = skdev->skreq_table; + +err_out: + return rc; +} + +static int skd_cons_skspcl(struct skd_device *skdev) +{ + int rc = 0; + u32 i, nbytes; + + pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_special_context), + skdev->n_special, + sizeof(struct skd_special_context) * skdev->n_special); + + skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context) + * skdev->n_special, GFP_KERNEL); + if (skdev->skspcl_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl; + + skspcl = &skdev->skspcl_table[i]; + + skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST; + skspcl->req.state = SKD_REQ_STATE_IDLE; + + skspcl->req.next = &skspcl[1].req; + + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + + skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); + if (skspcl->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + memset(skspcl->msg_buf, 0, nbytes); + + skspcl->req.sg = kzalloc(sizeof(struct scatterlist) * + SKD_N_SG_PER_SPECIAL, GFP_KERNEL); + if (skspcl->req.sg == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skspcl->req.sksg_list = skd_cons_sg_list(skdev, + SKD_N_SG_PER_SPECIAL, + &skspcl->req. + sksg_dma_address); + if (skspcl->req.sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skspcl_table[i - 1].req.next = NULL; + skdev->skspcl_free_list = skdev->skspcl_table; + + return rc; + +err_out: + return rc; +} + +static int skd_cons_sksb(struct skd_device *skdev) +{ + int rc = 0; + struct skd_special_context *skspcl; + u32 nbytes; + + skspcl = &skdev->internal_skspcl; + + skspcl->req.id = 0 + SKD_ID_INTERNAL; + skspcl->req.state = SKD_REQ_STATE_IDLE; + + nbytes = SKD_N_INTERNAL_BYTES; + + skspcl->data_buf = pci_alloc_consistent(skdev->pdev, nbytes, + &skspcl->db_dma_address); + if (skspcl->data_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + memset(skspcl->data_buf, 0, nbytes); + + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); + if (skspcl->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + memset(skspcl->msg_buf, 0, nbytes); + + skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1, + &skspcl->req.sksg_dma_address); + if (skspcl->req.sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + + if (!skd_format_internal_skspcl(skdev)) { + rc = -EINVAL; + goto err_out; + } + +err_out: + return rc; +} + +static int skd_cons_disk(struct skd_device *skdev) +{ + int rc = 0; + struct gendisk *disk; + struct request_queue *q; + unsigned long flags; + + disk = alloc_disk(SKD_MINORS_PER_DEVICE); + if (!disk) { + rc = -ENOMEM; + goto err_out; + } + + skdev->disk = disk; + sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno); + + disk->major = skdev->major; + disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE; + disk->fops = &skd_blockdev_ops; + disk->private_data = skdev; + + q = blk_init_queue(skd_request_fn, &skdev->lock); + if (!q) { + rc = -ENOMEM; + goto err_out; + } + + skdev->queue = q; + disk->queue = q; + q->queuedata = skdev; + + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_max_segments(q, skdev->sgs_per_request); + blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); + + /* set sysfs ptimal_io_size to 8K */ + blk_queue_io_opt(q, 8192); + + /* DISCARD Flag initialization. */ + q->limits.discard_granularity = 8192; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = UINT_MAX >> 9; + q->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d stopping %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_stop_queue(skdev->queue); + spin_unlock_irqrestore(&skdev->lock, flags); + +err_out: + return rc; +} + +#define SKD_N_DEV_TABLE 16u +static u32 skd_next_devno; + +static struct skd_device *skd_construct(struct pci_dev *pdev) +{ + struct skd_device *skdev; + int blk_major = skd_major; + int rc; + + skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); + + if (!skdev) { + pr_err(PFX "(%s): memory alloc failure\n", + pci_name(pdev)); + return NULL; + } + + skdev->state = SKD_DRVR_STATE_LOAD; + skdev->pdev = pdev; + skdev->devno = skd_next_devno++; + skdev->major = blk_major; + skdev->irq_type = skd_isr_type; + sprintf(skdev->name, DRV_NAME "%d", skdev->devno); + skdev->dev_max_queue_depth = 0; + + skdev->num_req_context = skd_max_queue_depth; + skdev->num_fitmsg_context = skd_max_queue_depth; + skdev->n_special = skd_max_pass_thru; + skdev->cur_max_queue_depth = 1; + skdev->queue_low_water_mark = 1; + skdev->proto_ver = 99; + skdev->sgs_per_request = skd_sgs_per_request; + skdev->dbg_level = skd_dbg_level; + + atomic_set(&skdev->device_count, 0); + + spin_lock_init(&skdev->lock); + + INIT_WORK(&skdev->completion_worker, skd_completion_worker); + + pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skcomp(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skmsg(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skreq(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skspcl(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + rc = skd_cons_sksb(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + rc = skd_cons_disk(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__); + return skdev; + +err_out: + pr_debug("%s:%s:%d construct failed\n", + skdev->name, __func__, __LINE__); + skd_destruct(skdev); + return NULL; +} + +/* + ***************************************************************************** + * DESTRUCT (FREE) + ***************************************************************************** + */ + +static void skd_free_skcomp(struct skd_device *skdev) +{ + if (skdev->skcomp_table != NULL) { + u32 nbytes; + + nbytes = sizeof(skdev->skcomp_table[0]) * + SKD_N_COMPLETION_ENTRY; + pci_free_consistent(skdev->pdev, nbytes, + skdev->skcomp_table, skdev->cq_dma_address); + } + + skdev->skcomp_table = NULL; + skdev->cq_dma_address = 0; +} + +static void skd_free_skmsg(struct skd_device *skdev) +{ + u32 i; + + if (skdev->skmsg_table == NULL) + return; + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg; + + skmsg = &skdev->skmsg_table[i]; + + if (skmsg->msg_buf != NULL) { + skmsg->msg_buf += skmsg->offset; + skmsg->mb_dma_address += skmsg->offset; + pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES, + skmsg->msg_buf, + skmsg->mb_dma_address); + } + skmsg->msg_buf = NULL; + skmsg->mb_dma_address = 0; + } + + kfree(skdev->skmsg_table); + skdev->skmsg_table = NULL; +} + +static void skd_free_sg_list(struct skd_device *skdev, + struct fit_sg_descriptor *sg_list, + u32 n_sg, dma_addr_t dma_addr) +{ + if (sg_list != NULL) { + u32 nbytes; + + nbytes = sizeof(*sg_list) * n_sg; + + pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); + } +} + +static void skd_free_skreq(struct skd_device *skdev) +{ + u32 i; + + if (skdev->skreq_table == NULL) + return; + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq; + + skreq = &skdev->skreq_table[i]; + + skd_free_sg_list(skdev, skreq->sksg_list, + skdev->sgs_per_request, + skreq->sksg_dma_address); + + skreq->sksg_list = NULL; + skreq->sksg_dma_address = 0; + + kfree(skreq->sg); + } + + kfree(skdev->skreq_table); + skdev->skreq_table = NULL; +} + +static void skd_free_skspcl(struct skd_device *skdev) +{ + u32 i; + u32 nbytes; + + if (skdev->skspcl_table == NULL) + return; + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl; + + skspcl = &skdev->skspcl_table[i]; + + if (skspcl->msg_buf != NULL) { + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + pci_free_consistent(skdev->pdev, nbytes, + skspcl->msg_buf, + skspcl->mb_dma_address); + } + + skspcl->msg_buf = NULL; + skspcl->mb_dma_address = 0; + + skd_free_sg_list(skdev, skspcl->req.sksg_list, + SKD_N_SG_PER_SPECIAL, + skspcl->req.sksg_dma_address); + + skspcl->req.sksg_list = NULL; + skspcl->req.sksg_dma_address = 0; + + kfree(skspcl->req.sg); + } + + kfree(skdev->skspcl_table); + skdev->skspcl_table = NULL; +} + +static void skd_free_sksb(struct skd_device *skdev) +{ + struct skd_special_context *skspcl; + u32 nbytes; + + skspcl = &skdev->internal_skspcl; + + if (skspcl->data_buf != NULL) { + nbytes = SKD_N_INTERNAL_BYTES; + + pci_free_consistent(skdev->pdev, nbytes, + skspcl->data_buf, skspcl->db_dma_address); + } + + skspcl->data_buf = NULL; + skspcl->db_dma_address = 0; + + if (skspcl->msg_buf != NULL) { + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + pci_free_consistent(skdev->pdev, nbytes, + skspcl->msg_buf, skspcl->mb_dma_address); + } + + skspcl->msg_buf = NULL; + skspcl->mb_dma_address = 0; + + skd_free_sg_list(skdev, skspcl->req.sksg_list, 1, + skspcl->req.sksg_dma_address); + + skspcl->req.sksg_list = NULL; + skspcl->req.sksg_dma_address = 0; +} + +static void skd_free_disk(struct skd_device *skdev) +{ + struct gendisk *disk = skdev->disk; + + if (disk != NULL) { + struct request_queue *q = disk->queue; + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (q) + blk_cleanup_queue(q); + put_disk(disk); + } + skdev->disk = NULL; +} + +static void skd_destruct(struct skd_device *skdev) +{ + if (skdev == NULL) + return; + + + pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + skd_free_disk(skdev); + + pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + skd_free_sksb(skdev); + + pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + skd_free_skspcl(skdev); + + pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + skd_free_skreq(skdev); + + pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + skd_free_skmsg(skdev); + + pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + skd_free_skcomp(skdev); + + pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__); + kfree(skdev); +} + +/* + ***************************************************************************** + * BLOCK DEVICE (BDEV) GLUE + ***************************************************************************** + */ + +static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct skd_device *skdev; + u64 capacity; + + skdev = bdev->bd_disk->private_data; + + pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n", + skdev->name, __func__, __LINE__, + bdev->bd_disk->disk_name, current->comm); + + if (skdev->read_cap_is_valid) { + capacity = get_capacity(skdev->disk); + geo->heads = 64; + geo->sectors = 255; + geo->cylinders = (capacity) / (255 * 64); + + return 0; + } + return -EIO; +} + +static int skd_bdev_attach(struct skd_device *skdev) +{ + pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); + add_disk(skdev->disk); + return 0; +} + +static const struct block_device_operations skd_blockdev_ops = { + .owner = THIS_MODULE, + .ioctl = skd_bdev_ioctl, + .getgeo = skd_bdev_getgeo, +}; + + +/* + ***************************************************************************** + * PCIe DRIVER GLUE + ***************************************************************************** + */ + +static DEFINE_PCI_DEVICE_TABLE(skd_pci_tbl) = { + { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, + { 0 } /* terminate list */ +}; + +MODULE_DEVICE_TABLE(pci, skd_pci_tbl); + +static char *skd_pci_info(struct skd_device *skdev, char *str) +{ + int pcie_reg; + + strcpy(str, "PCIe ("); + pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP); + + if (pcie_reg) { + + char lwstr[6]; + uint16_t pcie_lstat, lspeed, lwidth; + + pcie_reg += 0x12; + pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat); + lspeed = pcie_lstat & (0xF); + lwidth = (pcie_lstat & 0x3F0) >> 4; + + if (lspeed == 1) + strcat(str, "2.5GT/s "); + else if (lspeed == 2) + strcat(str, "5.0GT/s "); + else + strcat(str, "<unknown> "); + snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth); + strcat(str, lwstr); + } + return str; +} + +static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int i; + int rc = 0; + char pci_str[32]; + struct skd_device *skdev; + + pr_info("STEC s1120 Driver(%s) version %s-b%s\n", + DRV_NAME, DRV_VERSION, DRV_BUILD_ID); + pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n", + pci_name(pdev), pdev->vendor, pdev->device); + + rc = pci_enable_device(pdev); + if (rc) + return rc; + rc = pci_request_regions(pdev, DRV_NAME); + if (rc) + goto err_out; + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!rc) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { + + pr_err("(%s): consistent DMA mask error %d\n", + pci_name(pdev), rc); + } + } else { + (rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))); + if (rc) { + + pr_err("(%s): DMA mask error %d\n", + pci_name(pdev), rc); + goto err_out_regions; + } + } + + if (!skd_major) { + rc = register_blkdev(0, DRV_NAME); + if (rc < 0) + goto err_out_regions; + BUG_ON(!rc); + skd_major = rc; + } + + skdev = skd_construct(pdev); + if (skdev == NULL) { + rc = -ENOMEM; + goto err_out_regions; + } + + skd_pci_info(skdev, pci_str); + pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str); + + pci_set_master(pdev); + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) { + pr_err( + "(%s): bad enable of PCIe error reporting rc=%d\n", + skd_name(skdev), rc); + skdev->pcie_error_reporting_is_enabled = 0; + } else + skdev->pcie_error_reporting_is_enabled = 1; + + + pci_set_drvdata(pdev, skdev); + + skdev->disk->driverfs_dev = &pdev->dev; + + for (i = 0; i < SKD_MAX_BARS; i++) { + skdev->mem_phys[i] = pci_resource_start(pdev, i); + skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); + skdev->mem_map[i] = ioremap(skdev->mem_phys[i], + skdev->mem_size[i]); + if (!skdev->mem_map[i]) { + pr_err("(%s): Unable to map adapter memory!\n", + skd_name(skdev)); + rc = -ENODEV; + goto err_out_iounmap; + } + pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", + skdev->name, __func__, __LINE__, + skdev->mem_map[i], + (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + } + + rc = skd_acquire_irq(skdev); + if (rc) { + pr_err("(%s): interrupt resource error %d\n", + skd_name(skdev), rc); + goto err_out_iounmap; + } + + rc = skd_start_timer(skdev); + if (rc) + goto err_out_timer; + + init_waitqueue_head(&skdev->waitq); + + skd_start_device(skdev); + + rc = wait_event_interruptible_timeout(skdev->waitq, + (skdev->gendisk_on), + (SKD_START_WAIT_SECONDS * HZ)); + if (skdev->gendisk_on > 0) { + /* device came on-line after reset */ + skd_bdev_attach(skdev); + rc = 0; + } else { + /* we timed out, something is wrong with the device, + don't add the disk structure */ + pr_err( + "(%s): error: waiting for s1120 timed out %d!\n", + skd_name(skdev), rc); + /* in case of no error; we timeout with ENXIO */ + if (!rc) + rc = -ENXIO; + goto err_out_timer; + } + + +#ifdef SKD_VMK_POLL_HANDLER + if (skdev->irq_type == SKD_IRQ_MSIX) { + /* MSIX completion handler is being used for coredump */ + vmklnx_scsi_register_poll_handler(skdev->scsi_host, + skdev->msix_entries[5].vector, + skd_comp_q, skdev); + } else { + vmklnx_scsi_register_poll_handler(skdev->scsi_host, + skdev->pdev->irq, skd_isr, + skdev); + } +#endif /* SKD_VMK_POLL_HANDLER */ + + return rc; + +err_out_timer: + skd_stop_device(skdev); + skd_release_irq(skdev); + +err_out_iounmap: + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap(skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + skd_destruct(skdev); + +err_out_regions: + pci_release_regions(pdev); + +err_out: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return rc; +} + +static void skd_pci_remove(struct pci_dev *pdev) +{ + int i; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return; + } + skd_stop_device(skdev); + skd_release_irq(skdev); + + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap((u32 *)skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + skd_destruct(skdev); + + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + + return; +} + +static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) +{ + int i; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return -EIO; + } + + skd_stop_device(skdev); + + skd_release_irq(skdev); + + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap((u32 *)skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + pci_release_regions(pdev); + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return 0; +} + +static int skd_pci_resume(struct pci_dev *pdev) +{ + int i; + int rc = 0; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return -1; + } + + pci_set_power_state(pdev, PCI_D0); + pci_enable_wake(pdev, PCI_D0, 0); + pci_restore_state(pdev); + + rc = pci_enable_device(pdev); + if (rc) + return rc; + rc = pci_request_regions(pdev, DRV_NAME); + if (rc) + goto err_out; + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!rc) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { + + pr_err("(%s): consistent DMA mask error %d\n", + pci_name(pdev), rc); + } + } else { + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) { + + pr_err("(%s): DMA mask error %d\n", + pci_name(pdev), rc); + goto err_out_regions; + } + } + + pci_set_master(pdev); + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) { + pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", + skdev->name, rc); + skdev->pcie_error_reporting_is_enabled = 0; + } else + skdev->pcie_error_reporting_is_enabled = 1; + + for (i = 0; i < SKD_MAX_BARS; i++) { + + skdev->mem_phys[i] = pci_resource_start(pdev, i); + skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); + skdev->mem_map[i] = ioremap(skdev->mem_phys[i], + skdev->mem_size[i]); + if (!skdev->mem_map[i]) { + pr_err("(%s): Unable to map adapter memory!\n", + skd_name(skdev)); + rc = -ENODEV; + goto err_out_iounmap; + } + pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", + skdev->name, __func__, __LINE__, + skdev->mem_map[i], + (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + } + rc = skd_acquire_irq(skdev); + if (rc) { + + pr_err("(%s): interrupt resource error %d\n", + pci_name(pdev), rc); + goto err_out_iounmap; + } + + rc = skd_start_timer(skdev); + if (rc) + goto err_out_timer; + + init_waitqueue_head(&skdev->waitq); + + skd_start_device(skdev); + + return rc; + +err_out_timer: + skd_stop_device(skdev); + skd_release_irq(skdev); + +err_out_iounmap: + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap(skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + +err_out_regions: + pci_release_regions(pdev); + +err_out: + pci_disable_device(pdev); + return rc; +} + +static void skd_pci_shutdown(struct pci_dev *pdev) +{ + struct skd_device *skdev; + + pr_err("skd_pci_shutdown called\n"); + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return; + } + + pr_err("%s: calling stop\n", skd_name(skdev)); + skd_stop_device(skdev); +} + +static struct pci_driver skd_driver = { + .name = DRV_NAME, + .id_table = skd_pci_tbl, + .probe = skd_pci_probe, + .remove = skd_pci_remove, + .suspend = skd_pci_suspend, + .resume = skd_pci_resume, + .shutdown = skd_pci_shutdown, +}; + +/* + ***************************************************************************** + * LOGGING SUPPORT + ***************************************************************************** + */ + +static const char *skd_name(struct skd_device *skdev) +{ + memset(skdev->id_str, 0, sizeof(skdev->id_str)); + + if (skdev->inquiry_is_valid) + snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]", + skdev->name, skdev->inq_serial_num, + pci_name(skdev->pdev)); + else + snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]", + skdev->name, pci_name(skdev->pdev)); + + return skdev->id_str; +} + +const char *skd_drive_state_to_str(int state) +{ + switch (state) { + case FIT_SR_DRIVE_OFFLINE: + return "OFFLINE"; + case FIT_SR_DRIVE_INIT: + return "INIT"; + case FIT_SR_DRIVE_ONLINE: + return "ONLINE"; + case FIT_SR_DRIVE_BUSY: + return "BUSY"; + case FIT_SR_DRIVE_FAULT: + return "FAULT"; + case FIT_SR_DRIVE_DEGRADED: + return "DEGRADED"; + case FIT_SR_PCIE_LINK_DOWN: + return "INK_DOWN"; + case FIT_SR_DRIVE_SOFT_RESET: + return "SOFT_RESET"; + case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: + return "NEED_FW"; + case FIT_SR_DRIVE_INIT_FAULT: + return "INIT_FAULT"; + case FIT_SR_DRIVE_BUSY_SANITIZE: + return "BUSY_SANITIZE"; + case FIT_SR_DRIVE_BUSY_ERASE: + return "BUSY_ERASE"; + case FIT_SR_DRIVE_FW_BOOTING: + return "FW_BOOTING"; + default: + return "???"; + } +} + +const char *skd_skdev_state_to_str(enum skd_drvr_state state) +{ + switch (state) { + case SKD_DRVR_STATE_LOAD: + return "LOAD"; + case SKD_DRVR_STATE_IDLE: + return "IDLE"; + case SKD_DRVR_STATE_BUSY: + return "BUSY"; + case SKD_DRVR_STATE_STARTING: + return "STARTING"; + case SKD_DRVR_STATE_ONLINE: + return "ONLINE"; + case SKD_DRVR_STATE_PAUSING: + return "PAUSING"; + case SKD_DRVR_STATE_PAUSED: + return "PAUSED"; + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + return "DRAINING_TIMEOUT"; + case SKD_DRVR_STATE_RESTARTING: + return "RESTARTING"; + case SKD_DRVR_STATE_RESUMING: + return "RESUMING"; + case SKD_DRVR_STATE_STOPPING: + return "STOPPING"; + case SKD_DRVR_STATE_SYNCING: + return "SYNCING"; + case SKD_DRVR_STATE_FAULT: + return "FAULT"; + case SKD_DRVR_STATE_DISAPPEARED: + return "DISAPPEARED"; + case SKD_DRVR_STATE_BUSY_ERASE: + return "BUSY_ERASE"; + case SKD_DRVR_STATE_BUSY_SANITIZE: + return "BUSY_SANITIZE"; + case SKD_DRVR_STATE_BUSY_IMMINENT: + return "BUSY_IMMINENT"; + case SKD_DRVR_STATE_WAIT_BOOT: + return "WAIT_BOOT"; + + default: + return "???"; + } +} + +const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state) +{ + switch (state) { + case SKD_MSG_STATE_IDLE: + return "IDLE"; + case SKD_MSG_STATE_BUSY: + return "BUSY"; + default: + return "???"; + } +} + +const char *skd_skreq_state_to_str(enum skd_req_state state) +{ + switch (state) { + case SKD_REQ_STATE_IDLE: + return "IDLE"; + case SKD_REQ_STATE_SETUP: + return "SETUP"; + case SKD_REQ_STATE_BUSY: + return "BUSY"; + case SKD_REQ_STATE_COMPLETED: + return "COMPLETED"; + case SKD_REQ_STATE_TIMEOUT: + return "TIMEOUT"; + case SKD_REQ_STATE_ABORTED: + return "ABORTED"; + default: + return "???"; + } +} + +static void skd_log_skdev(struct skd_device *skdev, const char *event) +{ + pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skdev, event); + pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n", + skdev->name, __func__, __LINE__, + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_skdev_state_to_str(skdev->state), skdev->state); + pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n", + skdev->name, __func__, __LINE__, + skdev->in_flight, skdev->cur_max_queue_depth, + skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n", + skdev->name, __func__, __LINE__, + skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); +} + +static void skd_log_skmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg, const char *event) +{ + pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skmsg, event); + pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n", + skdev->name, __func__, __LINE__, + skd_skmsg_state_to_str(skmsg->state), skmsg->state, + skmsg->id, skmsg->length); +} + +static void skd_log_skreq(struct skd_device *skdev, + struct skd_request_context *skreq, const char *event) +{ + pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skreq, event); + pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n", + skdev->name, __func__, __LINE__, + skd_skreq_state_to_str(skreq->state), skreq->state, + skreq->id, skreq->fitmsg_id); + pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n", + skdev->name, __func__, __LINE__, + skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); + + if (skreq->req != NULL) { + struct request *req = skreq->req; + u32 lba = (u32)blk_rq_pos(req); + u32 count = blk_rq_sectors(req); + + pr_debug("%s:%s:%d " + "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", + skdev->name, __func__, __LINE__, + req, lba, lba, count, count, + (int)rq_data_dir(req)); + } else + pr_debug("%s:%s:%d req=NULL\n", + skdev->name, __func__, __LINE__); +} + +/* + ***************************************************************************** + * MODULE GLUE + ***************************************************************************** + */ + +static int __init skd_init(void) +{ + pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); + + switch (skd_isr_type) { + case SKD_IRQ_LEGACY: + case SKD_IRQ_MSI: + case SKD_IRQ_MSIX: + break; + default: + pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n", + skd_isr_type, SKD_IRQ_DEFAULT); + skd_isr_type = SKD_IRQ_DEFAULT; + } + + if (skd_max_queue_depth < 1 || + skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) { + pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n", + skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT); + skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; + } + + if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) { + pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n", + skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT); + skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; + } + + if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) { + pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n", + skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT); + skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; + } + + if (skd_dbg_level < 0 || skd_dbg_level > 2) { + pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n", + skd_dbg_level, 0); + skd_dbg_level = 0; + } + + if (skd_isr_comp_limit < 0) { + pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n", + skd_isr_comp_limit, 0); + skd_isr_comp_limit = 0; + } + + if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) { + pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n", + skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT); + skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; + } + + return pci_register_driver(&skd_driver); +} + +static void __exit skd_exit(void) +{ + pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID); + + pci_unregister_driver(&skd_driver); + + if (skd_major) + unregister_blkdev(skd_major, DRV_NAME); +} + +module_init(skd_init); +module_exit(skd_exit); diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h new file mode 100644 index 000000000000..61c757ff0161 --- /dev/null +++ b/drivers/block/skd_s1120.h @@ -0,0 +1,330 @@ +/* Copyright 2012 STEC, Inc. + * + * This file is licensed under the terms of the 3-clause + * BSD License (http://opensource.org/licenses/BSD-3-Clause) + * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), + * at your option. Both licenses are also available in the LICENSE file + * distributed with this project. This file may not be copied, modified, + * or distributed except in accordance with those terms. + */ + + +#ifndef SKD_S1120_H +#define SKD_S1120_H + +#pragma pack(push, s1120_h, 1) + +/* + * Q-channel, 64-bit r/w + */ +#define FIT_Q_COMMAND 0x400u +#define FIT_QCMD_QID_MASK (0x3 << 1) +#define FIT_QCMD_QID0 (0x0 << 1) +#define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0 +#define FIT_QCMD_QID1 (0x1 << 1) +#define FIT_QCMD_QID2 (0x2 << 1) +#define FIT_QCMD_QID3 (0x3 << 1) +#define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */ +#define FIT_QCMD_MSGSIZE_MASK (0x3 << 4) +#define FIT_QCMD_MSGSIZE_64 (0x0 << 4) +#define FIT_QCMD_MSGSIZE_128 (0x1 << 4) +#define FIT_QCMD_MSGSIZE_256 (0x2 << 4) +#define FIT_QCMD_MSGSIZE_512 (0x3 << 4) +#define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull) + +/* + * Control, 32-bit r/w + */ +#define FIT_CONTROL 0x500u +#define FIT_CR_HARD_RESET (1u << 0u) +#define FIT_CR_SOFT_RESET (1u << 1u) +#define FIT_CR_DIS_TIMESTAMPS (1u << 6u) +#define FIT_CR_ENABLE_INTERRUPTS (1u << 7u) + +/* + * Status, 32-bit, r/o + */ +#define FIT_STATUS 0x510u +#define FIT_SR_DRIVE_STATE_MASK 0x000000FFu +#define FIT_SR_SIGNATURE (0xFF << 8) +#define FIT_SR_PIO_DMA (1 << 16) +#define FIT_SR_DRIVE_OFFLINE 0x00 +#define FIT_SR_DRIVE_INIT 0x01 +/* #define FIT_SR_DRIVE_READY 0x02 */ +#define FIT_SR_DRIVE_ONLINE 0x03 +#define FIT_SR_DRIVE_BUSY 0x04 +#define FIT_SR_DRIVE_FAULT 0x05 +#define FIT_SR_DRIVE_DEGRADED 0x06 +#define FIT_SR_PCIE_LINK_DOWN 0x07 +#define FIT_SR_DRIVE_SOFT_RESET 0x08 +#define FIT_SR_DRIVE_INIT_FAULT 0x09 +#define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A +#define FIT_SR_DRIVE_BUSY_ERASE 0x0B +#define FIT_SR_DRIVE_FW_BOOTING 0x0C +#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE +#define FIT_SR_DEVICE_MISSING 0xFF +#define FIT_SR__RESERVED 0xFFFFFF00u + +/* + * FIT_STATUS - Status register data definition + */ +#define FIT_SR_STATE_MASK (0xFF << 0) +#define FIT_SR_SIGNATURE (0xFF << 8) +#define FIT_SR_PIO_DMA (1 << 16) + +/* + * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear) + */ +#define FIT_INT_STATUS_HOST 0x520u +#define FIT_ISH_FW_STATE_CHANGE (1u << 0u) +#define FIT_ISH_COMPLETION_POSTED (1u << 1u) +#define FIT_ISH_MSG_FROM_DEV (1u << 2u) +#define FIT_ISH_UNDEFINED_3 (1u << 3u) +#define FIT_ISH_UNDEFINED_4 (1u << 4u) +#define FIT_ISH_Q0_FULL (1u << 5u) +#define FIT_ISH_Q1_FULL (1u << 6u) +#define FIT_ISH_Q2_FULL (1u << 7u) +#define FIT_ISH_Q3_FULL (1u << 8u) +#define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u) +#define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u) + +#define FIT_INT_DEF_MASK \ + (FIT_ISH_FW_STATE_CHANGE | \ + FIT_ISH_COMPLETION_POSTED | \ + FIT_ISH_MSG_FROM_DEV | \ + FIT_ISH_Q0_FULL | \ + FIT_ISH_Q1_FULL | \ + FIT_ISH_Q2_FULL | \ + FIT_ISH_Q3_FULL | \ + FIT_ISH_QCMD_FIFO_OVERRUN | \ + FIT_ISH_BAD_EXP_ROM_READ) + +#define FIT_INT_QUEUE_FULL \ + (FIT_ISH_Q0_FULL | \ + FIT_ISH_Q1_FULL | \ + FIT_ISH_Q2_FULL | \ + FIT_ISH_Q3_FULL) + +#define MSI_MSG_NWL_ERROR_0 0x00000000 +#define MSI_MSG_NWL_ERROR_1 0x00000001 +#define MSI_MSG_NWL_ERROR_2 0x00000002 +#define MSI_MSG_NWL_ERROR_3 0x00000003 +#define MSI_MSG_STATE_CHANGE 0x00000004 +#define MSI_MSG_COMPLETION_POSTED 0x00000005 +#define MSI_MSG_MSG_FROM_DEV 0x00000006 +#define MSI_MSG_RESERVED_0 0x00000007 +#define MSI_MSG_RESERVED_1 0x00000008 +#define MSI_MSG_QUEUE_0_FULL 0x00000009 +#define MSI_MSG_QUEUE_1_FULL 0x0000000A +#define MSI_MSG_QUEUE_2_FULL 0x0000000B +#define MSI_MSG_QUEUE_3_FULL 0x0000000C + +#define FIT_INT_RESERVED_MASK \ + (FIT_ISH_UNDEFINED_3 | \ + FIT_ISH_UNDEFINED_4) + +/* + * Interrupt mask, 32-bit r/w + * Bit definitions are the same as FIT_INT_STATUS_HOST + */ +#define FIT_INT_MASK_HOST 0x528u + +/* + * Message to device, 32-bit r/w + */ +#define FIT_MSG_TO_DEVICE 0x540u + +/* + * Message from device, 32-bit, r/o + */ +#define FIT_MSG_FROM_DEVICE 0x548u + +/* + * 32-bit messages to/from device, composition/extraction macros + */ +#define FIT_MXD_CONS(TYPE, PARAM, DATA) \ + ((((TYPE) & 0xFFu) << 24u) | \ + (((PARAM) & 0xFFu) << 16u) | \ + (((DATA) & 0xFFFFu) << 0u)) +#define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu) +#define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu) +#define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu) + +/* + * Types of messages to/from device + */ +#define FIT_MTD_FITFW_INIT 0x01u +#define FIT_MTD_GET_CMDQ_DEPTH 0x02u +#define FIT_MTD_SET_COMPQ_DEPTH 0x03u +#define FIT_MTD_SET_COMPQ_ADDR 0x04u +#define FIT_MTD_ARM_QUEUE 0x05u +#define FIT_MTD_CMD_LOG_HOST_ID 0x07u +#define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u +#define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u +#define FIT_MFD_SMART_EXCEEDED 0x10u +#define FIT_MFD_POWER_DOWN 0x11u +#define FIT_MFD_OFFLINE 0x12u +#define FIT_MFD_ONLINE 0x13u +#define FIT_MFD_FW_RESTARTING 0x14u +#define FIT_MFD_PM_ACTIVE 0x15u +#define FIT_MFD_PM_STANDBY 0x16u +#define FIT_MFD_PM_SLEEP 0x17u +#define FIT_MFD_CMD_PROGRESS 0x18u + +#define FIT_MTD_DEBUG 0xFEu +#define FIT_MFD_DEBUG 0xFFu + +#define FIT_MFD_MASK (0xFFu) +#define FIT_MFD_DATA_MASK (0xFFu) +#define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK) +#define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK) + +/* + * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w + * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR) + * (was Response buffer in docs) + */ +#define FIT_MSG_TO_DEVICE_ARG 0x580u + +/* + * Hardware (ASIC) version, 32-bit r/o + */ +#define FIT_HW_VERSION 0x588u + +/* + * Scatter/gather list descriptor. + * 32-bytes and must be aligned on a 32-byte boundary. + * All fields are in little endian order. + */ +struct fit_sg_descriptor { + uint32_t control; + uint32_t byte_count; + uint64_t host_side_addr; + uint64_t dev_side_addr; + uint64_t next_desc_ptr; +}; + +#define FIT_SGD_CONTROL_NOT_LAST 0x000u +#define FIT_SGD_CONTROL_LAST 0x40Eu + +/* + * Header at the beginning of a FIT message. The header + * is followed by SSDI requests each 64 bytes. + * A FIT message can be up to 512 bytes long and must start + * on a 64-byte boundary. + */ +struct fit_msg_hdr { + uint8_t protocol_id; + uint8_t num_protocol_cmds_coalesced; + uint8_t _reserved[62]; +}; + +#define FIT_PROTOCOL_ID_FIT 1 +#define FIT_PROTOCOL_ID_SSDI 2 +#define FIT_PROTOCOL_ID_SOFIT 3 + + +#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF) +#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF) + +/* + * Format of a completion entry. The completion queue is circular + * and must have at least as many entries as the maximum number + * of commands that may be issued to the device. + * + * There are no head/tail pointers. The cycle value is used to + * infer the presence of new completion records. + * Initially the cycle in all entries is 0, the index is 0, and + * the cycle value to expect is 1. When completions are added + * their cycle values are set to 1. When the index wraps the + * cycle value to expect is incremented. + * + * Command_context is opaque and taken verbatim from the SSDI command. + * All other fields are big endian. + */ +#define FIT_PROTOCOL_VERSION_0 0 + +/* + * Protocol major version 1 completion entry. + * The major protocol version is found in bits + * 20-23 of the FIT_MTD_FITFW_INIT response. + */ +struct fit_completion_entry_v1 { + uint32_t num_returned_bytes; + uint16_t tag; + uint8_t status; /* SCSI status */ + uint8_t cycle; +}; +#define FIT_PROTOCOL_VERSION_1 1 +#define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1 + +struct fit_comp_error_info { + uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */ + uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */ + uint8_t reserved0; /* 01: Obsolete field */ + uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */ + uint8_t reserved2:1; /* 02: Reserved bit. */ + uint8_t bad_length:1; /* 02: Incorrect Length Indicator */ + uint8_t end_medium:1; /* 02: End of Medium */ + uint8_t file_mark:1; /* 02: Filemark */ + uint8_t info[4]; /* 03: */ + uint8_t reserved1; /* 07: Additional Sense Length */ + uint8_t cmd_spec[4]; /* 08: Command Specific Information */ + uint8_t code; /* 0C: Additional Sense Code */ + uint8_t qual; /* 0D: Additional Sense Code Qualifier */ + uint8_t fruc; /* 0E: Field Replaceable Unit Code */ + uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */ + uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */ + uint16_t sks_low; /* 10: Sense Key Specific (LSW) */ + uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */ + uint16_t uec; /* 14: Additional Sense Bytes */ + uint64_t per; /* 16: Additional Sense Bytes */ + uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */ +}; + + +/* Task management constants */ +#define SOFT_TASK_SIMPLE 0x00 +#define SOFT_TASK_HEAD_OF_QUEUE 0x01 +#define SOFT_TASK_ORDERED 0x02 + +/* Version zero has the last 32 bits reserved, + * Version one has the last 32 bits sg_list_len_bytes; + */ +struct skd_command_header { + uint64_t sg_list_dma_address; + uint16_t tag; + uint8_t attribute; + uint8_t add_cdb_len; /* In 32 bit words */ + uint32_t sg_list_len_bytes; +}; + +struct skd_scsi_request { + struct skd_command_header hdr; + unsigned char cdb[16]; +/* unsigned char _reserved[16]; */ +}; + +struct driver_inquiry_data { + uint8_t peripheral_device_type:5; + uint8_t qualifier:3; + uint8_t page_code; + uint16_t page_length; + uint16_t pcie_bus_number; + uint8_t pcie_device_number; + uint8_t pcie_function_number; + uint8_t pcie_link_speed; + uint8_t pcie_link_lanes; + uint16_t pcie_vendor_id; + uint16_t pcie_device_id; + uint16_t pcie_subsystem_vendor_id; + uint16_t pcie_subsystem_device_id; + uint8_t reserved1[2]; + uint8_t reserved2[3]; + uint8_t driver_version_length; + uint8_t driver_version[0x14]; +}; + +#pragma pack(pop, s1120_h) + +#endif /* SKD_S1120_H */ diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index bf4b9d282c04..6620b73d0490 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -887,6 +887,8 @@ static int dispatch_discard_io(struct xen_blkif *blkif, unsigned long secure; struct phys_req preq; + xen_blkif_get(blkif); + preq.sector_number = req->u.discard.sector_number; preq.nr_sects = req->u.discard.nr_sectors; @@ -899,7 +901,6 @@ static int dispatch_discard_io(struct xen_blkif *blkif, } blkif->st_ds_req++; - xen_blkif_get(blkif); secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? BLKDEV_DISCARD_SECURE : 0; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8d53ed293606..432db1b59b00 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -121,7 +121,8 @@ struct blkfront_info struct work_struct work; struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; - struct list_head persistent_gnts; + struct list_head grants; + struct list_head indirect_pages; unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; @@ -200,15 +201,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) if (!gnt_list_entry) goto out_of_memory; - granted_page = alloc_page(GFP_NOIO); - if (!granted_page) { - kfree(gnt_list_entry); - goto out_of_memory; + if (info->feature_persistent) { + granted_page = alloc_page(GFP_NOIO); + if (!granted_page) { + kfree(gnt_list_entry); + goto out_of_memory; + } + gnt_list_entry->pfn = page_to_pfn(granted_page); } - gnt_list_entry->pfn = page_to_pfn(granted_page); gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->persistent_gnts); + list_add(&gnt_list_entry->node, &info->grants); i++; } @@ -216,9 +219,10 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->persistent_gnts, node) { + &info->grants, node) { list_del(&gnt_list_entry->node); - __free_page(pfn_to_page(gnt_list_entry->pfn)); + if (info->feature_persistent) + __free_page(pfn_to_page(gnt_list_entry->pfn)); kfree(gnt_list_entry); i--; } @@ -227,13 +231,14 @@ out_of_memory: } static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long pfn, struct blkfront_info *info) { struct grant *gnt_list_entry; unsigned long buffer_mfn; - BUG_ON(list_empty(&info->persistent_gnts)); - gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant, + BUG_ON(list_empty(&info->grants)); + gnt_list_entry = list_first_entry(&info->grants, struct grant, node); list_del(&gnt_list_entry->node); @@ -245,6 +250,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (!info->feature_persistent) { + BUG_ON(!pfn); + gnt_list_entry->pfn = pfn; + } buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); gnttab_grant_foreign_access_ref(gnt_list_entry->gref, info->xbdev->otherend_id, @@ -400,10 +409,13 @@ static int blkif_queue_request(struct request *req) if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; - max_grefs = info->max_indirect_segments ? - info->max_indirect_segments + - INDIRECT_GREFS(info->max_indirect_segments) : - BLKIF_MAX_SEGMENTS_PER_REQUEST; + max_grefs = req->nr_phys_segments; + if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) + /* + * If we are using indirect segments we need to account + * for the indirect grefs used in the request. + */ + max_grefs += INDIRECT_GREFS(req->nr_phys_segments); /* Check if we have enough grants to allocate a requests */ if (info->persistent_gnts_c < max_grefs) { @@ -477,22 +489,34 @@ static int blkif_queue_request(struct request *req) if ((ring_req->operation == BLKIF_OP_INDIRECT) && (i % SEGS_PER_INDIRECT_FRAME == 0)) { + unsigned long pfn; + if (segments) kunmap_atomic(segments); n = i / SEGS_PER_INDIRECT_FRAME; - gnt_list_entry = get_grant(&gref_head, info); + if (!info->feature_persistent) { + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + pfn = page_to_pfn(indirect_page); + } + gnt_list_entry = get_grant(&gref_head, pfn, info); info->shadow[id].indirect_grants[n] = gnt_list_entry; segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&gref_head, info); + gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); ref = gnt_list_entry->gref; info->shadow[id].grants_used[i] = gnt_list_entry; - if (rq_data_dir(req)) { + if (rq_data_dir(req) && info->feature_persistent) { char *bvec_data; void *shared_data; @@ -904,21 +928,36 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* Remove all persistent grants */ - if (!list_empty(&info->persistent_gnts)) { + if (!list_empty(&info->grants)) { list_for_each_entry_safe(persistent_gnt, n, - &info->persistent_gnts, node) { + &info->grants, node) { list_del(&persistent_gnt->node); if (persistent_gnt->gref != GRANT_INVALID_REF) { gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); info->persistent_gnts_c--; } - __free_page(pfn_to_page(persistent_gnt->pfn)); + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); kfree(persistent_gnt); } } BUG_ON(info->persistent_gnts_c != 0); + /* + * Remove indirect pages, this only happens when using indirect + * descriptors but not persistent grants + */ + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + + BUG_ON(info->feature_persistent); + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } + for (i = 0; i < BLK_RING_SIZE; i++) { /* * Clear persistent grants present in requests already @@ -933,7 +972,8 @@ static void blkif_free(struct blkfront_info *info, int suspend) for (j = 0; j < segs; j++) { persistent_gnt = info->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - __free_page(pfn_to_page(persistent_gnt->pfn)); + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); kfree(persistent_gnt); } @@ -992,7 +1032,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, nseg = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; - if (bret->operation == BLKIF_OP_READ) { + if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { /* * Copy the data received from the backend into the bvec. * Since bv_offset can be different than 0, and bv_len different @@ -1013,13 +1053,51 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < nseg; i++) { - list_add(&s->grants_used[i]->node, &info->persistent_gnts); - info->persistent_gnts_c++; + if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + /* + * If the grant is still mapped by the backend (the + * backend has chosen to make this grant persistent) + * we add it at the head of the list, so it will be + * reused first. + */ + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + list_add(&s->grants_used[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + /* + * If the grant is not mapped by the backend we end the + * foreign access and add it to the tail of the list, + * so it will not be picked again unless we run out of + * persistent grants. + */ + gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); + s->grants_used[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->grants_used[i]->node, &info->grants); + } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(nseg); i++) { - list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); - info->persistent_gnts_c++; + if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + list_add(&s->indirect_grants[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + struct page *indirect_page; + + gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); + /* + * Add the used indirect page back to the list of + * available pages for indirect grefs. + */ + indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + list_add(&indirect_page->lru, &info->indirect_pages); + s->indirect_grants[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->indirect_grants[i]->node, &info->grants); + } } } } @@ -1313,7 +1391,8 @@ static int blkfront_probe(struct xenbus_device *dev, spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; - INIT_LIST_HEAD(&info->persistent_gnts); + INIT_LIST_HEAD(&info->grants); + INIT_LIST_HEAD(&info->indirect_pages); info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); @@ -1609,6 +1688,23 @@ static int blkfront_setup_indirect(struct blkfront_info *info) if (err) goto out_of_memory; + if (!info->feature_persistent && info->max_indirect_segments) { + /* + * We are using indirect descriptors but not persistent + * grants, we need to allocate a set of pages that can be + * used for mapping indirect grefs + */ + int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + + BUG_ON(!list_empty(&info->indirect_pages)); + for (i = 0; i < num; i++) { + struct page *indirect_page = alloc_page(GFP_NOIO); + if (!indirect_page) + goto out_of_memory; + list_add(&indirect_page->lru, &info->indirect_pages); + } + } + for (i = 0; i < BLK_RING_SIZE; i++) { info->shadow[i].grants_used = kzalloc( sizeof(info->shadow[i].grants_used[0]) * segs, @@ -1639,6 +1735,13 @@ out_of_memory: kfree(info->shadow[i].indirect_grants); info->shadow[i].indirect_grants = NULL; } + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } return -ENOMEM; } diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d29f3e..2638417b19aa 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -13,15 +13,8 @@ config BCACHE_DEBUG ---help--- Don't select this option unless you're a developer - Enables extra debugging tools (primarily a fuzz tester) - -config BCACHE_EDEBUG - bool "Extended runtime checks" - depends on BCACHE - ---help--- - Don't select this option unless you're a developer - - Enables extra runtime checks which significantly affect performance + Enables extra debugging tools, allows expensive runtime checks to be + turned on. config BCACHE_CLOSURES_DEBUG bool "Debug closures" diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e45f5575fd4d..2b46bf1d7e40 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,13 +63,12 @@ #include "bcache.h" #include "btree.h" +#include <linux/blkdev.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/random.h> #include <trace/events/bcache.h> -#define MAX_IN_FLIGHT_DISCARDS 8U - /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Discard/TRIM */ - -struct discard { - struct list_head list; - struct work_struct work; - struct cache *ca; - long bucket; - - struct bio bio; - struct bio_vec bv; -}; - -static void discard_finish(struct work_struct *w) -{ - struct discard *d = container_of(w, struct discard, work); - struct cache *ca = d->ca; - char buf[BDEVNAME_SIZE]; - - if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { - pr_notice("discard error on %s, disabling", - bdevname(ca->bdev, buf)); - d->ca->discard = 0; - } - - mutex_lock(&ca->set->bucket_lock); - - fifo_push(&ca->free, d->bucket); - list_add(&d->list, &ca->discards); - atomic_dec(&ca->discards_in_flight); - - mutex_unlock(&ca->set->bucket_lock); - - closure_wake_up(&ca->set->bucket_wait); - wake_up_process(ca->alloc_thread); - - closure_put(&ca->set->cl); -} - -static void discard_endio(struct bio *bio, int error) -{ - struct discard *d = container_of(bio, struct discard, bio); - schedule_work(&d->work); -} - -static void do_discard(struct cache *ca, long bucket) -{ - struct discard *d = list_first_entry(&ca->discards, - struct discard, list); - - list_del(&d->list); - d->bucket = bucket; - - atomic_inc(&ca->discards_in_flight); - closure_get(&ca->set->cl); - - bio_init(&d->bio); - - d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); - d->bio.bi_bdev = ca->bdev; - d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; - d->bio.bi_max_vecs = 1; - d->bio.bi_io_vec = d->bio.bi_inline_vecs; - d->bio.bi_size = bucket_bytes(ca); - d->bio.bi_end_io = discard_endio; - bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - submit_bio(0, &d->bio); -} - /* Allocation */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -280,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca) * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } @@ -305,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca) if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -330,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca) if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg) else break; - allocator_wait(ca, (int) fifo_free(&ca->free) > - atomic_read(&ca->discards_in_flight)); - if (ca->discard) { - allocator_wait(ca, !list_empty(&ca->discards)); - do_discard(ca, bucket); - } else { - fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.block_size, GFP_KERNEL, 0); + mutex_lock(&ca->set->bucket_lock); } + + allocator_wait(ca, !fifo_full(&ca->free)); + + fifo_push(&ca->free, bucket); + wake_up(&ca->set->bucket_wait); } /* @@ -433,16 +365,40 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) { - long r = -1; -again: + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + goto out; + } + + if (!wait) + return -1; + + while (1) { + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + break; + } + + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } + + finish_wait(&ca->set->bucket_wait, &w); +out: wake_up_process(ca->alloc_thread); - if (fifo_used(&ca->free) > ca->watermark[watermark] && - fifo_pop(&ca->free, r)) { - struct bucket *b = ca->buckets + r; -#ifdef CONFIG_BCACHE_EDEBUG + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -455,36 +411,23 @@ again: BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); -#endif - BUG_ON(atomic_read(&b->pin) != 1); - - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - - if (watermark <= WATERMARK_METADATA) { - SET_GC_MARK(b, GC_MARK_METADATA); - b->prio = BTREE_PRIO; - } else { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - b->prio = INITIAL_PRIO; - } - - return r; } - trace_bcache_alloc_fail(ca); + b = ca->buckets + r; - if (cl) { - closure_wait(&ca->set->bucket_wait, cl); + BUG_ON(atomic_read(&b->pin) != 1); - if (closure_blocking(cl)) { - mutex_unlock(&ca->set->bucket_lock); - closure_sync(cl); - mutex_lock(&ca->set->bucket_lock); - goto again; - } + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; } - return -1; + return r; } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -501,7 +444,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int i; @@ -514,7 +457,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, cl); + long b = bch_bucket_alloc(ca, watermark, wait); if (b == -1) goto err; @@ -529,22 +472,202 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return 0; err: bch_bucket_free(c, k); - __bkey_put(c, k); + bkey_put(c, k); return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ? WATERMARK_MOVINGGC + : WATERMARK_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + /* Init */ +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + int bch_cache_allocator_start(struct cache *ca) { struct task_struct *k = kthread_run(bch_allocator_thread, @@ -556,22 +679,8 @@ int bch_cache_allocator_start(struct cache *ca) return 0; } -void bch_cache_allocator_exit(struct cache *ca) -{ - struct discard *d; - - while (!list_empty(&ca->discards)) { - d = list_first_entry(&ca->discards, struct discard, list); - cancel_work_sync(&d->work); - list_del(&d->list); - kfree(d); - } -} - int bch_cache_allocator_init(struct cache *ca) { - unsigned i; - /* * Reserve: * Prio/gen writes first @@ -589,15 +698,5 @@ int bch_cache_allocator_init(struct cache *ca) ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + ca->watermark[WATERMARK_MOVINGGC]; - for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { - struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) - return -ENOMEM; - - d->ca = ca; - INIT_WORK(&d->work, discard_finish); - list_add(&d->list, &ca->discards); - } - return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0f12382aa35d..4beb55a0ff30 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,6 +177,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> #include <linux/list.h> @@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_METADATA 2 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); -struct bkey { - uint64_t high; - uint64_t low; - uint64_t ptr[]; -}; - -/* Enough for a key with 6 pointers */ -#define BKEY_PAD 8 - -#define BKEY_PADDED(key) \ - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb { - uint64_t csum; - uint64_t offset; /* sector where this sb was written */ - uint64_t version; - - uint8_t magic[16]; - - uint8_t uuid[16]; - union { - uint8_t set_uuid[16]; - uint64_t set_magic; - }; - uint8_t label[SB_LABEL_SIZE]; - - uint64_t flags; - uint64_t seq; - uint64_t pad[8]; - - union { - struct { - /* Cache devices */ - uint64_t nbuckets; /* device size */ - - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ - - uint16_t nr_in_set; - uint16_t nr_this_dev; - }; - struct { - /* Backing devices */ - uint64_t data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - uint32_t last_mount; /* time_t */ - - uint16_t first_bucket; - union { - uint16_t njournal_buckets; - uint16_t keys; - }; - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_VERSION 1 - -/* - * This is the on disk format for btree nodes - a btree node on disk is a list - * of these; within each set the keys are sorted - */ -struct bset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - -/* - * On disk format for priorities and gens - see super.c near prio_write() for - * more. - */ -struct prio_set { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t pad; - - uint64_t next_bucket; - - struct bucket_disk { - uint16_t prio; - uint8_t gen; - } __attribute((packed)) data[]; -}; - -struct uuid_entry { - union { - struct { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - - uint32_t flags; - /* Size of flash only volumes */ - uint64_t sectors; - }; - - uint8_t pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - #include "journal.h" #include "stats.h" struct search; @@ -384,8 +223,6 @@ struct keybuf_key { void *private; }; -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - struct keybuf { struct bkey last_scanned; spinlock_t lock; @@ -400,7 +237,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 100 +#define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -429,16 +266,15 @@ struct bcache_device { struct gendisk *disk; - /* If nonzero, we're closing */ - atomic_t closing; - - /* If nonzero, we're detaching/unregistering from cache set */ - atomic_t detaching; - int flush_done; + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 - uint64_t nr_stripes; - unsigned stripe_size_bits; + unsigned nr_stripes; + unsigned stripe_size; atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -509,7 +345,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; - struct closure_with_timer writeback; + struct task_struct *writeback_thread; struct keybuf writeback_keys; @@ -527,8 +363,8 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; - unsigned sequential_merge:1; unsigned verify:1; + unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; @@ -620,15 +456,6 @@ struct cache { bool discard; /* Get rid of? */ - /* - * We preallocate structs for issuing discards to buckets, and keep them - * on this list when they're not in use; do_discard() issues discards - * whenever there's work to do and is called by free_some_buckets() and - * when a discard finishes. - */ - atomic_t discards_in_flight; - struct list_head discards; - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -649,7 +476,6 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; @@ -744,8 +570,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. The * btree_root() macro releases the lock when it returns. */ - struct closure *try_harder; - struct closure_waitlist try_wait; + struct task_struct *try_harder; + wait_queue_head_t try_wait; uint64_t try_harder_start; /* @@ -759,7 +585,7 @@ struct cache_set { * written. */ atomic_t prio_blocked; - struct closure_waitlist bucket_wait; + wait_queue_head_t bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -782,7 +608,7 @@ struct cache_set { struct gc_stat gc_stats; size_t nbuckets; - struct closure_with_waitlist gc; + struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -795,11 +621,10 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - struct closure moving_gc; - struct closure_waitlist moving_gc_wait; + wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - atomic_t in_flight; + struct semaphore moving_in_flight; struct btree *root; @@ -841,22 +666,27 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - spinlock_t sort_time_lock; struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; - spinlock_t btree_read_time_lock; struct time_stats btree_read_time; struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; unsigned error_limit; unsigned error_decay; + unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; + unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -865,21 +695,6 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; -static inline bool key_merging_disabled(struct cache_set *c) -{ -#ifdef CONFIG_BCACHE_DEBUG - return c->key_merging_disabled; -#else - return 0; -#endif -} - -static inline bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - struct bbio { unsigned submit_time_us; union { @@ -933,59 +748,6 @@ static inline unsigned local_clock_us(void) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - -#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) -#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) -#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) - -/* Bkey fields: all units are in sectors */ - -#define KEY_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey, field, offset, size) - -#define PTR_FIELD(name, offset, size) \ - static inline uint64_t name(const struct bkey *k, unsigned i) \ - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ - \ - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ - { \ - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ - k->ptr[i] |= v << offset; \ - } - -KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) -KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) -KEY_FIELD(KEY_DIRTY, high, 36, 1) - -KEY_FIELD(KEY_SIZE, high, 20, 16) -KEY_FIELD(KEY_INODE, high, 0, 20) - -/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ - -static inline uint64_t KEY_OFFSET(const struct bkey *k) -{ - return k->low; -} - -static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) -{ - k->low = v; -} - -PTR_FIELD(PTR_DEV, 51, 12) -PTR_FIELD(PTR_OFFSET, 8, 43) -PTR_FIELD(PTR_GEN, 0, 8) - -#define PTR_CHECK_DEV ((1 << 12) - 1) - -#define PTR(gen, offset, dev) \ - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) - static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -1024,27 +786,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, /* Btree key macros */ -/* - * The high bit being set is a relic from when we used it to do binary - * searches - it told you where a key started. It's not used anymore, - * and can probably be safely dropped. - */ -#define KEY(dev, sector, len) \ -((struct bkey) { \ - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ - .low = (sector) \ -}) - static inline void bkey_init(struct bkey *k) { - *k = KEY(0, 0, 0); + *k = ZERO_KEY; } -#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) -#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) -#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) -#define ZERO_KEY KEY(0, 0, 0) - /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs @@ -1094,14 +840,6 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) -static inline void __bkey_put(struct cache_set *c, struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); -} - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) @@ -1173,13 +911,15 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +long bch_bucket_alloc(struct cache *, unsigned, bool); void bch_bucket_free(struct cache_set *, struct bkey *); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -1187,7 +927,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; @@ -1220,15 +960,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); -void bch_writeback_exit(void); -int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 22d1ae72c282..7d388b8bb50e 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -14,22 +14,12 @@ /* Keylists */ -void bch_keylist_copy(struct keylist *dest, struct keylist *src) -{ - *dest = *src; - - if (src->list == src->d) { - size_t n = (uint64_t *) src->top - src->d; - dest->top = (struct bkey *) &dest->d[n]; - dest->list = dest->d; - } -} - int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) { - unsigned oldsize = (uint64_t *) l->top - l->list; - unsigned newsize = oldsize + 2 + nptrs; - uint64_t *new; + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + 2 + nptrs; + uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; + uint64_t *new_keys; /* The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, @@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) roundup_pow_of_two(oldsize) == newsize) return 0; - new = krealloc(l->list == l->d ? NULL : l->list, - sizeof(uint64_t) * newsize, GFP_NOIO); + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); - if (!new) + if (!new_keys) return -ENOMEM; - if (l->list == l->d) - memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); + if (!old_keys) + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); - l->list = new; - l->top = (struct bkey *) (&l->list[oldsize]); + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; return 0; } struct bkey *bch_keylist_pop(struct keylist *l) { - struct bkey *k = l->bottom; + struct bkey *k = l->keys; if (k == l->top) return NULL; @@ -73,21 +62,20 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } -/* Pointer validation */ - -bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) +void bch_keylist_pop_front(struct keylist *l) { - unsigned i; - char buf[80]; + l->top_p -= bkey_u64s(l->keys); - if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) - goto bad; + memmove(l->keys, + bkey_next(l->keys), + bch_keylist_bytes(l)); +} - if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; +/* Pointer validation */ - if (!KEY_SIZE(k)) - return true; +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -98,13 +86,83 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) if (KEY_SIZE(k) + r > c->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) - goto bad; + return true; } return false; +} + +bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; bad: bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + + } else { + if (g->prio == BTREE_PRIO) + goto err; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto err; + } + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); return true; } @@ -118,64 +176,29 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) bch_ptr_invalid(b, k)) return true; - if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) - return true; + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(b->c, k, i)) + return true; - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(b->c, k, i)) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); - if (stale) - return true; + if (stale) + return true; -#ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif - } + if (expensive_debug_checks(b->c) && + ptr_bad_expensive_checks(b, k, i)) + return true; + } return false; -#ifdef CONFIG_BCACHE_EDEBUG -bug: - mutex_unlock(&b->c->bucket_lock); - - { - char buf[80]; - - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - } - return true; -#endif } /* Key/pointer manipulation */ @@ -458,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) { -#ifdef CONFIG_X86_64 - asm("shrd %[shift],%[high],%[low]" - : [low] "+Rm" (low) - : [high] "R" (high), - [shift] "ci" (shift) - : "cc"); -#else low >>= shift; low |= (high << 1) << (63U - shift); -#endif return low; } @@ -686,7 +701,7 @@ void bch_bset_init_next(struct btree *b) } else get_random_bytes(&i->seq, sizeof(uint64_t)); - i->magic = bset_magic(b->c); + i->magic = bset_magic(&b->c->sb); i->version = 0; i->keys = 0; @@ -824,16 +839,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, } else i = bset_search_write_set(b, t, search); -#ifdef CONFIG_BCACHE_EDEBUG - BUG_ON(bset_written(b, t) && - i.l != t->data->start && - bkey_cmp(tree_to_prev_bkey(t, - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), - search) > 0); + if (expensive_debug_checks(b->c)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); - BUG_ON(i.r != end(t->data) && - bkey_cmp(i.r, search) <= 0); -#endif + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); + } while (likely(i.l != i.r) && bkey_cmp(i.l, search) <= 0) @@ -844,6 +859,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, /* Btree iterator */ +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { @@ -867,12 +889,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, } struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) + struct bkey *search, struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + for (; start <= &b->sets[b->nsets]; start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, end(start->data)); @@ -887,6 +913,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) struct bkey *ret = NULL; if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + ret = iter->data->k; iter->data->k = bkey_next(iter->data->k); @@ -916,14 +944,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, return ret; } -struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) -{ - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, search); - return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); -} - /* Mergesort */ static void sort_key_next(struct btree_iter *iter, @@ -998,7 +1018,6 @@ static void btree_mergesort(struct btree *b, struct bset *out, out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; pr_debug("sorted %i keys", out->keys); - bch_check_key_order(b, out); } static void __btree_sort(struct btree *b, struct btree_iter *iter, @@ -1029,7 +1048,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(b->c); + out->magic = bset_magic(&b->c->sb); out->seq = b->sets[0].data->seq; out->version = b->sets[0].data->version; swap(out, b->sets[0].data); @@ -1050,24 +1069,21 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, if (b->written) bset_build_written_tree(b); - if (!start) { - spin_lock(&b->c->sort_time_lock); + if (!start) bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); - } } void bch_btree_sort_partial(struct btree *b, unsigned start) { - size_t oldsize = 0, order = b->page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; + int oldsize = bch_count_data(b); + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); BUG_ON(b->sets[b->nsets].data == write_block(b) && (b->sets[b->nsets].size || b->nsets)); - if (b->written) - oldsize = bch_count_data(b); if (start) { unsigned i; @@ -1083,7 +1099,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) __btree_sort(b, &iter, start, order, false); - EBUG_ON(b->written && bch_count_data(b) != oldsize); + EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) @@ -1101,9 +1117,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) btree_mergesort(b, new->sets->data, &iter, false, true); - spin_lock(&b->c->sort_time_lock); bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); bkey_copy_key(&new->key, &b->key); new->sets->size = 0; @@ -1148,16 +1162,16 @@ out: /* Sysfs stuff */ struct bset_stats { + struct btree_op op; size_t nodes; size_t sets_written, sets_unwritten; size_t bytes_written, bytes_unwritten; size_t floats, failed; }; -static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, - struct bset_stats *stats) +static int btree_bset_stats(struct btree_op *op, struct btree *b) { - struct bkey *k; + struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; stats->nodes++; @@ -1182,30 +1196,19 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, } } - if (b->level) { - struct btree_iter iter; - - for_each_key_filter(b, k, &iter, bch_ptr_bad) { - int ret = btree(bset_stats, k, b, op, stats); - if (ret) - return ret; - } - } - - return 0; + return MAP_CONTINUE; } int bch_bset_print_stats(struct cache_set *c, char *buf) { - struct btree_op op; struct bset_stats t; int ret; - bch_btree_op_init_stack(&op); memset(&t, 0, sizeof(struct bset_stats)); + bch_btree_op_init(&t.op, -1); - ret = btree_root(bset_stats, c, &op, &t); - if (ret) + ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) return ret; return snprintf(buf, PAGE_SIZE, diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae115a253d73..1d3c24f9fa0e 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -148,6 +148,9 @@ struct btree_iter { size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree *b; +#endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; @@ -193,54 +196,26 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -static inline size_t bkey_u64s(const struct bkey *k) -{ - BUG_ON(KEY_CSUM(k) > 1); - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); -} - -static inline size_t bkey_bytes(const struct bkey *k) -{ - return bkey_u64s(k) * sizeof(uint64_t); -} - -static inline void bkey_copy(struct bkey *dest, const struct bkey *src) -{ - memcpy(dest, src, bkey_bytes(src)); -} - -static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) -{ - if (!src) - src = &KEY(0, 0, 0); - - SET_KEY_INODE(dest, KEY_INODE(src)); - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); -} - -static inline struct bkey *bkey_next(const struct bkey *k) -{ - uint64_t *d = (void *) k; - return (struct bkey *) (d + bkey_u64s(k)); -} - /* Keylists */ struct keylist { - struct bkey *top; union { - uint64_t *list; - struct bkey *bottom; + struct bkey *keys; + uint64_t *keys_p; + }; + union { + struct bkey *top; + uint64_t *top_p; }; /* Enough room for btree_split's keys without realloc */ #define KEYLIST_INLINE 16 - uint64_t d[KEYLIST_INLINE]; + uint64_t inline_keys[KEYLIST_INLINE]; }; static inline void bch_keylist_init(struct keylist *l) { - l->top = (void *) (l->list = l->d); + l->top_p = l->keys_p = l->inline_keys; } static inline void bch_keylist_push(struct keylist *l) @@ -256,17 +231,32 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k) static inline bool bch_keylist_empty(struct keylist *l) { - return l->top == (void *) l->list; + return l->top == l->keys; +} + +static inline void bch_keylist_reset(struct keylist *l) +{ + l->top = l->keys; } static inline void bch_keylist_free(struct keylist *l) { - if (l->list != l->d) - kfree(l->list); + if (l->keys_p != l->inline_keys) + kfree(l->keys_p); +} + +static inline size_t bch_keylist_nkeys(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch_keylist_bytes(struct keylist *l) +{ + return bch_keylist_nkeys(l) * sizeof(uint64_t); } -void bch_keylist_copy(struct keylist *, struct keylist *); struct bkey *bch_keylist_pop(struct keylist *); +void bch_keylist_pop_front(struct keylist *); int bch_keylist_realloc(struct keylist *, int, struct cache_set *); void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, @@ -287,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) } const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); +bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); +bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); + bool bch_ptr_bad(struct btree *, const struct bkey *); static inline uint8_t gen_after(uint8_t a, uint8_t b) @@ -311,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k, typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); -struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); struct bkey *bch_btree_iter_next(struct btree_iter *); struct bkey *bch_btree_iter_next_filter(struct btree_iter *, struct btree *, ptr_filter_fn); @@ -361,12 +352,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *); struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, const struct bkey *); +/* + * Returns the first key that is strictly greater than search + */ static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, const struct bkey *search) { return search ? __bch_bset_search(b, t, search) : t->data->start; } +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); void bch_btree_sort_lazy(struct btree *); void bch_btree_sort_into(struct btree *, struct btree *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f42fc7ed9cd6..5e2765aadce1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,12 +23,13 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include "writeback.h" #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/freezer.h> #include <linux/hash.h> +#include <linux/kthread.h> #include <linux/prefetch.h> #include <linux/random.h> #include <linux/rcupdate.h> @@ -88,15 +89,13 @@ * Test module load/unload */ -static const char * const op_types[] = { - "insert", "replace" +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, }; -static const char *op_type(struct btree_op *op) -{ - return op_types[op->type]; -} - #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -105,23 +104,89 @@ static const char *op_type(struct btree_op *op) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; static struct workqueue_struct *btree_io_wq; -void bch_btree_op_init_stack(struct btree_op *op) +static inline bool should_split(struct btree *b) { - memset(op, 0, sizeof(struct btree_op)); - closure_init_stack(&op->cl); - op->lock = -1; - bch_keylist_init(&op->keys); + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (b->written + __set_blocks(i, i->keys + 15, b->c) + > btree_blocks(b)); } +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ + } while (_r == -EINTR); \ + \ + _r; \ +}) + /* Btree key manipulation */ -static void bkey_put(struct cache_set *c, struct bkey *k, int level) +void bkey_put(struct cache_set *c, struct bkey *k) { - if ((level && KEY_OFFSET(k)) || !level) - __bkey_put(c, k); + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } /* Btree IO */ @@ -145,6 +210,10 @@ static void bch_btree_node_read_done(struct btree *b) iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + if (!i->seq) goto err; @@ -160,7 +229,7 @@ static void bch_btree_node_read_done(struct btree *b) goto err; err = "bad magic"; - if (i->magic != bset_magic(b->c)) + if (i->magic != bset_magic(&b->c->sb)) goto err; err = "bad checksum"; @@ -248,10 +317,7 @@ void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - - spin_lock(&b->c->btree_read_time_lock); bch_time_stats_update(&b->c->btree_read_time, start_time); - spin_unlock(&b->c->btree_read_time_lock); return; err: @@ -327,7 +393,7 @@ static void do_btree_node_write(struct btree *b) b->bio = bch_bbio_alloc(b->c); b->bio->bi_end_io = btree_node_write_endio; - b->bio->bi_private = &b->io.cl; + b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); @@ -383,7 +449,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); - bch_check_key_order(b, i); + bch_check_keys(b, "writing"); cancel_delayed_work(&b->work); @@ -405,6 +471,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) bch_bset_init_next(b); } +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + bch_btree_node_write(b, &cl); + closure_sync(&cl); +} + static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); @@ -416,7 +491,7 @@ static void btree_node_write_work(struct work_struct *w) rw_unlock(true, b); } -static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); @@ -429,15 +504,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op && op->journal) { + if (journal_ref) { if (w->journal && - journal_pin_cmp(b->c, w, op)) { + journal_pin_cmp(b->c, w->journal, journal_ref)) { atomic_dec_bug(w->journal); w->journal = NULL; } if (!w->journal) { - w->journal = op->journal; + w->journal = journal_ref; atomic_inc(w->journal); } } @@ -566,33 +641,32 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (b->page_order < min_order || + (!flush && + (btree_node_dirty(b) || + atomic_read(&b->io.cl.remaining) != -1))) { rw_unlock(true, b); return -ENOMEM; } - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); + if (btree_node_dirty(b)) + bch_btree_node_write_sync(b); - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; - } + /* wait for any in flight btree write */ + closure_wait_event(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); return 0; } @@ -633,7 +707,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; @@ -652,7 +726,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -723,12 +797,9 @@ int bch_btree_cache_alloc(struct cache_set *c) { unsigned i; - /* XXX: doesn't check for errors */ - - closure_init_unlocked(&c->gc); - for (i = 0; i < mca_reserve(c); i++) - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); @@ -775,52 +846,27 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) { - int ret = -ENOMEM; - struct btree *i; + struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!cl) - return ERR_PTR(-ENOMEM); - - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. - */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); - - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (!c->try_harder) { + c->try_harder = current; + c->try_harder_start = local_clock(); + } else if (c->try_harder != current) + return ERR_PTR(-ENOSPC); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } /* @@ -829,20 +875,21 @@ retry: * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +static void bch_cannibalize_unlock(struct cache_set *c) { - if (c->try_harder == cl) { + if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + wake_up(&c->try_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -852,14 +899,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->sets[0].data) goto err; @@ -884,6 +931,7 @@ out: lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); b->level = level; + b->parent = (void *) ~0UL; mca_reinit(b); @@ -892,7 +940,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, k); if (!IS_ERR(b)) goto out; @@ -903,17 +951,15 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; - * if that closure is in non blocking mode, will return -EAGAIN. + * If IO is necessary and running under generic_make_request, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. */ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) + int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -925,7 +971,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -971,7 +1017,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -982,17 +1028,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { unsigned i; trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); if (btree_node_dirty(b)) @@ -1015,27 +1056,26 @@ static void btree_node_free(struct btree *b, struct btree_op *op) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, - struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) goto err; + bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, &k.key, level); if (IS_ERR(b)) goto err_free; if (!b) { cache_bug(c, "Tried to allocate bucket that was in btree cache"); - __bkey_put(c, &k.key); goto retry; } @@ -1048,7 +1088,6 @@ retry: return b; err_free: bch_bucket_free(c, &k.key); - __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); @@ -1056,16 +1095,31 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, - struct closure *cl) +static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); return n; } +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned i; + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) { + uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; + + SET_PTR_GEN(k, i, g); + } + + atomic_inc(&b->c->prio_blocked); +} + /* Garbage collection */ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) @@ -1119,12 +1173,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, - struct gc_stat *gc) +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned last_dev = -1; - struct bcache_device *d = NULL; + unsigned keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1132,27 +1184,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nodes++; for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - if (last_dev != KEY_INODE(k)) { - last_dev = KEY_INODE(k); - - d = KEY_INODE(k) < b->c->nr_uuids - ? b->c->devices[last_dev] - : NULL; - } - stale = max(stale, btree_mark_key(b, k)); + keys++; if (bch_ptr_bad(b, k)) continue; - *keys += bkey_u64s(k); - gc->key_bytes += bkey_u64s(k); gc->nkeys++; + good_keys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) - gc->dirty += KEY_SIZE(k); } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1161,78 +1203,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); - return stale; -} - -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) -{ - /* - * We block priorities from being written for the duration of garbage - * collection, so we can't sleep in btree_alloc() -> - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it - * our closure. - */ - struct btree *n = btree_node_alloc_replacement(b, NULL); - - if (!IS_ERR_OR_NULL(n)) { - swap(b, n); - __bkey_put(b->c, &b->key); + if (b->c->gc_always_rewrite) + return true; - memcpy(k->ptr, b->key.ptr, - sizeof(uint64_t) * KEY_PTRS(&b->key)); + if (stale > 10) + return true; - btree_node_free(n, op); - up_write(&n->lock); - } + if ((keys - good_keys) * 2 > keys) + return true; - return b; + return false; } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. - */ -#define GC_MERGE_NODES 2U +#define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; - struct bkey *k; unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct keylist *keylist, struct gc_stat *gc, + struct gc_merge_info *r) { - unsigned nodes = 0, keys = 0, blocks; - int i; + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; + struct closure cl; + struct bkey *k; + + memset(new_nodes, 0, sizeof(new_nodes)); + closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && r[nodes].b) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) - return; - - for (i = nodes - 1; i >= 0; --i) { - if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + return 0; - if (r[i].b->written) - return; + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = r[i].b->sets->data; - struct bset *n2 = r[i - 1].b->sets->data; + struct bset *n1 = new_nodes[i]->sets->data; + struct bset *n2 = new_nodes[i - 1]->sets->data; struct bkey *k, *last = NULL; keys = 0; - if (i == 1) { + if (i > 1) { + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. Have to try and fit all of @@ -1241,37 +1279,27 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, * length keys (shouldn't be possible in practice, * though) */ - if (__set_blocks(n1, n1->keys + r->keys, - b->c) > btree_blocks(r[i].b)) - return; + if (__set_blocks(n1, n1->keys + n2->keys, + b->c) > btree_blocks(new_nodes[i])) + goto out_nocoalesce; keys = n2->keys; + /* Take the key of the node we're getting rid of */ last = &r->b->key; - } else - for (k = n2->start; - k < end(n2); - k = bkey_next(k)) { - if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) - break; - - last = k; - keys += bkey_u64s(k); - } + } BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(r[i].b)); + b->c) > btree_blocks(new_nodes[i])); - if (last) { - bkey_copy_key(&r[i].b->key, last); - bkey_copy_key(r[i].k, last); - } + if (last) + bkey_copy_key(&new_nodes[i]->key, last); memcpy(end(n1), n2->start, (void *) node(n2, keys) - (void *) n2->start); n1->keys += keys; + r[i].keys = n1->keys; memmove(n2->start, node(n2, keys), @@ -1279,95 +1307,176 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, n2->keys -= keys; - r[i].keys = n1->keys; - r[i - 1].keys = n2->keys; + if (bch_keylist_realloc(keylist, + KEY_PTRS(&new_nodes[i]->key), b->c)) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(keylist, &new_nodes[i]->key); } - btree_node_free(r->b, op); - up_write(&r->b->lock); + for (i = 0; i < nodes; i++) { + if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + goto out_nocoalesce; - trace_bcache_btree_gc_coalesce(nodes); + make_btree_freeing_key(r[i].b, keylist->top); + bch_keylist_push(keylist); + } + + /* We emptied out this node */ + BUG_ON(new_nodes[0]->sets->data->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + + closure_sync(&cl); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + bch_btree_insert_node(b, op, keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(keylist)); + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; - nodes--; - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, - struct closure *writes, struct gc_stat *gc) +static unsigned btree_gc_count_keys(struct btree *b) { - void write(struct btree *r) - { - if (!r->written) - bch_btree_node_write(r, &op->cl); - else if (btree_node_dirty(r)) - bch_btree_node_write(r, writes); + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; - up_write(&r->lock); - } + for_each_key_filter(b, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); + + return ret; +} - int ret = 0, stale; +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ unsigned i; + int ret = 0; + bool should_rewrite; + struct btree *n; + struct bkey *k; + struct keylist keys; + struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *last = r + GC_MERGE_NODES - 1; - memset(r, 0, sizeof(r)); + bch_keylist_init(&keys); + bch_btree_iter_init(b, &iter, &b->c->gc_done); - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + for (i = 0; i < GC_MERGE_NODES; i++) + r[i].b = ERR_PTR(-EINTR); - if (IS_ERR(r->b)) { - ret = PTR_ERR(r->b); - break; + while (1) { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, &keys, gc, r); + if (ret) + break; } - r->keys = 0; - stale = btree_gc_mark_node(r->b, &r->keys, gc); + if (!last->b) + break; - if (!b->written && - (r->b->level || stale > 10 || - b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(last->b, + false); - if (r->b->level) - ret = btree_gc_recurse(r->b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_keylist_add(&keys, &n->key); - if (ret) { - write(r->b); - break; - } + make_btree_freeing_key(last->b, + keys.top); + bch_keylist_push(&keys); + + btree_node_free(last->b); + + bch_btree_insert_node(b, op, &keys, + NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); - bkey_copy_key(&b->c->gc_done, r->k); + rw_unlock(true, last->b); + last->b = n; - if (!b->written) - btree_gc_coalesce(b, op, gc, r); + /* Invalidated our iterator */ + ret = -EINTR; + break; + } + } - if (r[GC_MERGE_NODES - 1].b) - write(r[GC_MERGE_NODES - 1].b); + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } - memmove(&r[1], &r[0], - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; - /* When we've got incremental GC working, we'll want to do - * if (should_resched()) - * return -EAGAIN; - */ - cond_resched(); -#if 0 if (need_resched()) { ret = -EAGAIN; break; } -#endif } - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) - write(r[i].b); + for (i = 0; i < GC_MERGE_NODES; i++) + if (!IS_ERR_OR_NULL(r[i].b)) { + if (btree_node_dirty(r[i].b)) + bch_btree_node_write(r[i].b, writes); + rw_unlock(true, r[i].b); + } - /* Might have freed some children, must remove their keys */ - if (!b->written) - bch_btree_sort(b); + bch_keylist_free(&keys); return ret; } @@ -1376,29 +1485,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; - unsigned keys = 0; - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); - - if (b->level || stale > 10) - n = btree_node_alloc_replacement(b, NULL); + int ret = 0; + bool should_rewrite; - if (!IS_ERR_OR_NULL(n)) - swap(b, n); + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(b, false); - if (b->level) - ret = btree_gc_recurse(b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); - if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? &op->cl : NULL); + return -EINTR; + } } - if (!IS_ERR_OR_NULL(n)) { - closure_sync(&op->cl); - bch_btree_set_root(b); - btree_node_free(n, op); - rw_unlock(true, b); + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; } + bkey_copy_key(&b->c->gc_done, &b->key); + return ret; } @@ -1479,9 +1590,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) return available; } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); int ret; unsigned long available; struct gc_stat stats; @@ -1493,47 +1603,73 @@ static void bch_btree_gc(struct closure *cl) memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); btree_gc_start(c); - atomic_inc(&c->prio_blocked); - - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&op.cl); - closure_sync(&writes); - - if (ret) { - pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); - } + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); - /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &op.cl); - closure_sync(&op.cl); + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); available = bch_btree_gc_finish(c); - - atomic_dec(&c->prio_blocked); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); - stats.dirty <<= 9; stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); - continue_at(cl, bch_moving_gc, bch_gc_wq); + bch_moving_gc(c); +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + struct cache *ca; + unsigned i; + + while (1) { +again: + bch_btree_gc(c); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; + } + + mutex_unlock(&c->bucket_lock); + + try_to_freeze(); + schedule(); + } + + return 0; } -void bch_queue_gc(struct cache_set *c) +int bch_gc_thread_start(struct cache_set *c) { - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(c->gc_thread)) + return PTR_ERR(c->gc_thread); + + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; } /* Initial partial gc */ @@ -1541,9 +1677,9 @@ void bch_queue_gc(struct cache_set *c) static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, unsigned long **seen) { - int ret; + int ret = 0; unsigned i; - struct bkey *k; + struct bkey *k, *p = NULL; struct bucket *g; struct btree_iter iter; @@ -1570,31 +1706,32 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, } if (b->level) { - k = bch_next_recurse_key(b, &ZERO_KEY); + bch_btree_iter_init(b, &iter, NULL); - while (k) { - struct bkey *p = bch_next_recurse_key(b, k); - if (p) - btree_node_prefetch(b->c, p, b->level - 1); + do { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) + btree_node_prefetch(b->c, k, b->level - 1); - ret = btree(check_recurse, k, b, op, seen); - if (ret) - return ret; + if (p) + ret = btree(check_recurse, p, b, op, seen); - k = p; - } + p = k; + } while (p && !ret); } return 0; } -int bch_btree_check(struct cache_set *c, struct btree_op *op) +int bch_btree_check(struct cache_set *c) { int ret = -ENOMEM; unsigned i; unsigned long *seen[MAX_CACHES_PER_SET]; + struct btree_op op; memset(seen, 0, sizeof(seen)); + bch_btree_op_init(&op, SHRT_MAX); for (i = 0; c->cache[i]; i++) { size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); @@ -1606,7 +1743,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op) memset(seen[i], 0xFF, n); } - ret = btree_root(check_recurse, c, op, seen); + ret = btree_root(check_recurse, c, &op, seen); err: for (i = 0; i < MAX_CACHES_PER_SET; i++) kfree(seen[i]); @@ -1628,10 +1765,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) bch_bset_fix_lookup_table(b, where); } -static bool fix_overlapping_extents(struct btree *b, - struct bkey *insert, +static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, struct btree_iter *iter, - struct btree_op *op) + struct bkey *replace_key) { void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { @@ -1659,39 +1795,38 @@ static bool fix_overlapping_extents(struct btree *b, * We might overlap with 0 size extents; we can't skip these * because if they're in the set we're inserting to we have to * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for BTREE_REPLACE + * inserting. But we don't want to check them for replace * operations. */ - if (op->type == BTREE_REPLACE && - KEY_SIZE(k)) { + if (replace_key && KEY_SIZE(k)) { /* * k might have been split since we inserted/found the * key we're replacing */ unsigned i; uint64_t offset = KEY_START(k) - - KEY_START(&op->replace); + KEY_START(replace_key); /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(&op->replace) || - KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) goto check_failed; /* We didn't find a key that we were supposed to */ if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) + if (KEY_PTRS(replace_key) != KEY_PTRS(k)) goto check_failed; /* skip past gen */ offset <<= 8; - BUG_ON(!KEY_PTRS(&op->replace)); + BUG_ON(!KEY_PTRS(replace_key)); - for (i = 0; i < KEY_PTRS(&op->replace); i++) - if (k->ptr[i] != op->replace.ptr[i] + offset) + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) goto check_failed; sectors_found = KEY_OFFSET(k) - KEY_START(insert); @@ -1742,6 +1877,9 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(insert, k) < 0) { bch_cut_front(insert, k); } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + if (bkey_written(b, k) && bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { /* @@ -1759,9 +1897,8 @@ static bool fix_overlapping_extents(struct btree *b, } check_failed: - if (op->type == BTREE_REPLACE) { + if (replace_key) { if (!sectors_found) { - op->insert_collision = true; return true; } else if (sectors_found < KEY_SIZE(insert)) { SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - @@ -1774,7 +1911,7 @@ check_failed: } static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k) + struct bkey *k, struct bkey *replace_key) { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; @@ -1786,22 +1923,23 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (!b->level) { struct btree_iter iter; - struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); /* * bset_search() returns the first key that is strictly greater * than the search key - but for back merging, we want to find - * the first key that is greater than or equal to KEY_START(k) - - * unless KEY_START(k) is 0. + * the previous key. */ - if (KEY_OFFSET(&search)) - SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); - prev = NULL; - m = bch_btree_iter_init(b, &iter, &search); + m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); - if (fix_overlapping_extents(b, k, &iter, op)) + if (fix_overlapping_extents(b, k, &iter, replace_key)) { + op->insert_collision = true; return false; + } + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); while (m != end(i) && bkey_cmp(k, &START_KEY(m)) > 0) @@ -1825,84 +1963,80 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; - } else + } else { + BUG_ON(replace_key); m = bch_bset_search(b, &b->sets[b->nsets], k); + } insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - - bch_check_keys(b, "%u for %s", status, op_type(op)); + bch_check_keys(b, "%u for %s", status, + replace_key ? "replace" : "insert"); if (b->level && !KEY_OFFSET(k)) btree_current_write(b)->prio_blocked++; - trace_bcache_btree_insert_key(b, k, op->type, status); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); return true; } -static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) { bool ret = false; - struct bkey *k; - unsigned oldsize = bch_count_data(b); - - while ((k = bch_keylist_pop(&op->keys))) { - bkey_put(b->c, k, b->level); - ret |= btree_insert_key(b, op, k); - } - - BUG_ON(bch_count_data(b) < oldsize); - return ret; -} + int oldsize = bch_count_data(b); -bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, - struct bio *bio) -{ - bool ret = false; - uint64_t btree_ptr = b->key.ptr[0]; - unsigned long seq = b->seq; - BKEY_PADDED(k) tmp; + while (!bch_keylist_empty(insert_keys)) { + struct bset *i = write_block(b); + struct bkey *k = insert_keys->keys; - rw_unlock(false, b); - rw_lock(true, b, b->level); + if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) + > btree_blocks(b)) + break; - if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1 || - should_split(b)) - goto out; + if (bkey_cmp(k, &b->key) <= 0) { + if (!b->level) + bkey_put(b->c, k); - op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); + ret |= btree_insert_key(b, op, k, replace_key); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->keys); - SET_KEY_PTRS(&op->replace, 1); - get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->keys); - SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); + ret |= btree_insert_key(b, op, &temp.key, replace_key); + break; + } else { + break; + } + } - bkey_copy(&tmp.k, &op->replace); + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); - BUG_ON(op->type != BTREE_INSERT); - BUG_ON(!btree_insert_key(b, op, &tmp.k)); - ret = true; -out: - downgrade_write(&b->lock); + BUG_ON(bch_count_data(b) < oldsize); return ret; } -static int btree_split(struct btree *b, struct btree_op *op) +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) { - bool split, root = b == b->c->root; + bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); + struct closure cl; + struct keylist parent_keys; - if (b->level) - set_closure_blocking(&op->cl); + closure_init_stack(&cl); + bch_keylist_init(&parent_keys); - n1 = btree_node_alloc_replacement(b, &op->cl); + n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) goto err; @@ -1913,19 +2047,20 @@ static int btree_split(struct btree *b, struct btree_op *op) trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + n2 = bch_btree_node_alloc(b->c, b->level, true); if (IS_ERR(n2)) goto err_free1; - if (root) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + if (!b->parent) { + n3 = bch_btree_node_alloc(b->c, b->level + 1, true); if (IS_ERR(n3)) goto err_free2; } - bch_btree_insert_keys(n1, op); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); - /* Has to be a linear search because we don't have an auxiliary + /* + * Has to be a linear search because we don't have an auxiliary * search tree yet */ @@ -1944,60 +2079,57 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); - bch_keylist_add(&op->keys, &n2->key); - bch_btree_node_write(n2, &op->cl); + bch_keylist_add(&parent_keys, &n2->key); + bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); - bch_btree_insert_keys(n1, op); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); } - bch_keylist_add(&op->keys, &n1->key); - bch_btree_node_write(n1, &op->cl); + bch_keylist_add(&parent_keys, &n1->key); + bch_btree_node_write(n1, &cl); if (n3) { + /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op); - bch_btree_node_write(n3, &op->cl); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); + bch_btree_node_write(n3, &cl); - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); - } else if (root) { - op->keys.top = op->keys.bottom; - closure_sync(&op->cl); - bch_btree_set_root(n1); - } else { - unsigned i; - bkey_copy(op->keys.top, &b->key); - bkey_copy_key(op->keys.top, &ZERO_KEY); + btree_node_free(b); + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + closure_sync(&cl); + bch_btree_set_root(n1); - for (i = 0; i < KEY_PTRS(&b->key); i++) { - uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; + btree_node_free(b); + } else { + /* Split a non root node */ + closure_sync(&cl); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); - SET_PTR_GEN(op->keys.top, i, g); - } + btree_node_free(b); - bch_keylist_push(&op->keys); - closure_sync(&op->cl); - atomic_inc(&b->c->prio_blocked); + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); } rw_unlock(true, n1); - btree_node_free(b, op); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: - __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + btree_node_free(n2); rw_unlock(true, n2); err_free1: - __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + btree_node_free(n1); rw_unlock(true, n1); err: if (n3 == ERR_PTR(-EAGAIN) || @@ -2009,116 +2141,126 @@ err: return -ENOMEM; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *stack_keys) +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key) { - if (b->level) { - int ret; - struct bkey *insert = op->keys.bottom; - struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); - - if (!k) { - btree_bug(b, "no key to recurse on at level %i/%i", - b->level, b->c->root->level); + BUG_ON(b->level && replace_key); - op->keys.top = op->keys.bottom; - return -EIO; + if (should_split(b)) { + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + return btree_split(b, op, insert_keys, replace_key) ?: + -EINTR; } + } else { + BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bkey_cmp(insert, k) > 0) { - unsigned i; - - if (op->type == BTREE_REPLACE) { - __bkey_put(b->c, insert); - op->keys.top = op->keys.bottom; - op->insert_collision = true; - return 0; - } + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write_sync(b); + } - for (i = 0; i < KEY_PTRS(insert); i++) - atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); + return 0; + } +} - bkey_copy(stack_keys->top, insert); +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; - bch_cut_back(k, insert); - bch_cut_front(k, stack_keys->top); + bch_keylist_init(&insert); - bch_keylist_push(stack_keys); - } + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); - ret = btree(insert_recurse, k, b, op, stack_keys); - if (ret) - return ret; + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) + goto out; } - if (!bch_keylist_empty(&op->keys)) { - if (should_split(b)) { - if (op->lock <= b->c->root->level) { - BUG_ON(b->level); - op->lock = b->c->root->level + 1; - return -EINTR; - } - return btree_split(b, op); - } + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); - BUG_ON(write_block(b) != b->sets[b->nsets].data); + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); - if (bch_btree_insert_keys(b, op)) { - if (!b->level) - bch_btree_leaf_dirty(b, op); - else - bch_btree_node_write(b, &op->cl); - } - } + bch_keylist_add(&insert, check_key); - return 0; + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; } -int bch_btree_insert(struct btree_op *op, struct cache_set *c) +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; + +int btree_insert_fn(struct btree_op *b_op, struct btree *b) { - int ret = 0; - struct keylist stack_keys; + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); - /* - * Don't want to block with the btree locked unless we have to, - * otherwise we get deadlocks with try_harder and between split/gc - */ - clear_closure_blocking(&op->cl); - - BUG_ON(bch_keylist_empty(&op->keys)); - bch_keylist_copy(&stack_keys, &op->keys); - bch_keylist_init(&op->keys); - - while (!bch_keylist_empty(&stack_keys) || - !bch_keylist_empty(&op->keys)) { - if (bch_keylist_empty(&op->keys)) { - bch_keylist_add(&op->keys, - bch_keylist_pop(&stack_keys)); - op->lock = 0; - } + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; +} - ret = btree_root(insert_recurse, c, op, &stack_keys); +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) +{ + struct btree_insert_op op; + int ret = 0; - if (ret == -EAGAIN) { - ret = 0; - closure_sync(&op->cl); - } else if (ret) { - struct bkey *k; + BUG_ON(current->bio_list); + BUG_ON(bch_keylist_empty(keys)); + + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; + + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } - pr_err("error %i trying to insert key for %s", - ret, op_type(op)); + if (ret) { + struct bkey *k; - while ((k = bch_keylist_pop(&stack_keys) ?: - bch_keylist_pop(&op->keys))) - bkey_put(c, k, 0); - } - } + pr_err("error %i", ret); - bch_keylist_free(&stack_keys); + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k); + } else if (op.op.insert_collision) + ret = -ESRCH; - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; return ret; } @@ -2141,132 +2283,81 @@ void bch_btree_set_root(struct btree *b) mutex_unlock(&b->c->bucket_lock); b->c->root = b; - __bkey_put(b->c, &b->key); bch_journal_meta(b->c, &cl); closure_sync(&cl); } -/* Cache lookup */ +/* Map across nodes or keys */ -static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, - struct bkey *k) +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - int ret = 0; + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; - while (!ret && - !op->lookup_done) { - unsigned sectors = INT_MAX; + bch_btree_iter_init(b, &iter, from); - if (KEY_INODE(k) == op->inode) { - if (KEY_START(k) <= bio->bi_sector) - break; + while ((k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); + if (ret != MAP_CONTINUE) + return ret; } - - ret = s->d->cache_miss(b, s, bio, sectors); } + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + return ret; } -/* - * Read from a single key, handling the initial cache miss if the key starts in - * the middle of the bio - */ -static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, - struct bkey *k) +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - unsigned ptr; - struct bio *n; - - int ret = submit_partial_cache_miss(b, op, k); - if (ret || op->lookup_done) - return ret; - - /* XXX: figure out best pointer - for multiple cache devices */ - ptr = 0; - - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - - while (!op->lookup_done && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - op->lookup_done = true; - - bio_key = &container_of(n, struct bbio, bio)->key; - - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). - */ - - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); - - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; - - __bch_submit_bbio(n, b->c); - } - - return 0; + return btree_root(map_nodes_recurse, c, op, from, fn, flags); } -int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - - int ret = 0; + int ret = MAP_CONTINUE; struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); - if (!k) { - /* - * b->key would be exactly what we want, except that - * pointers to btree nodes have nonzero size - we - * wouldn't go far enough - */ + bch_btree_iter_init(b, &iter, from); - ret = submit_partial_cache_miss(b, op, - &KEY(KEY_INODE(&b->key), - KEY_OFFSET(&b->key), 0)); - break; - } + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } - ret = b->level - ? btree(search_recurse, k, b, op) - : submit_partial_cache_hit(b, op, k); - } while (!ret && - !op->lookup_done); + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); return ret; } +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + return btree_root(map_keys_recurse, c, op, from, fn, flags); +} + /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) @@ -2285,80 +2376,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } -static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end, - keybuf_pred_fn *pred) -{ - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &buf->last_scanned); - - while (!array_freelist_empty(&buf->freelist)) { - struct bkey *k = bch_btree_iter_next_filter(&iter, b, - bch_ptr_bad); - - if (!b->level) { - if (!k) { - buf->last_scanned = b->key; - break; - } +struct refill { + struct btree_op op; + unsigned nr_found; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; - buf->last_scanned = *k; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; - if (pred(buf, k)) { - struct keybuf_key *w; + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } - spin_lock(&buf->lock); + if (!KEY_SIZE(k)) /* end key */ + goto out; - w = array_alloc(&buf->freelist); + if (refill->pred(buf, k)) { + struct keybuf_key *w; - w->private = NULL; - bkey_copy(&w->key, k); + spin_lock(&buf->lock); - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } - spin_unlock(&buf->lock); - } - } else { - if (!k) - break; + w->private = NULL; + bkey_copy(&w->key, k); - btree(refill_keybuf, k, b, op, buf, end, pred); - /* - * Might get an error here, but can't really do anything - * and it'll get logged elsewhere. Just read what we - * can. - */ + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + else + refill->nr_found++; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; - cond_resched(); - } + spin_unlock(&buf->lock); } - - return 0; +out: + buf->last_scanned = *k; + return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct refill refill; cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end, pred); - closure_sync(&op.cl); + bch_btree_op_init(&refill.op, -1); + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); - pr_debug("found %s keys from %llu:%llu to %llu:%llu", - RB_EMPTY_ROOT(&buf->keys) ? "no" : - array_freelist_empty(&buf->freelist) ? "some" : "a few", - KEY_INODE(&start), KEY_OFFSET(&start), - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); @@ -2436,9 +2526,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, - struct keybuf *buf, - struct bkey *end, - keybuf_pred_fn *pred) + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2471,14 +2561,12 @@ void bch_btree_exit(void) { if (btree_io_wq) destroy_workqueue(btree_io_wq); - if (bch_gc_wq) - destroy_workqueue(bch_gc_wq); } int __init bch_btree_init(void) { - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); + if (!btree_io_wq) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3333d3723633..767e75570896 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -125,6 +125,7 @@ struct btree { unsigned long seq; struct rw_semaphore lock; struct cache_set *c; + struct btree *parent; unsigned long flags; uint16_t written; /* would be nice to kill */ @@ -200,12 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); -} - -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - return __bch_ptr_invalid(b->c, b->level, k); + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } static inline struct bkey *bch_btree_iter_init(struct btree *b, @@ -215,6 +211,16 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } +static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) +{ + if (b->level) + return bch_btree_ptr_invalid(b->c, k); + else + return bch_extent_ptr_invalid(b->c, k); +} + +void bkey_put(struct cache_set *c, struct bkey *k); + /* Looping macros */ #define for_each_cached_btree(b, c, iter) \ @@ -234,51 +240,17 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, /* Recursing down the btree */ struct btree_op { - struct closure cl; - struct cache_set *c; - - /* Journal entry we have a refcount on */ - atomic_t *journal; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - - unsigned inode; - - uint16_t write_prio; - /* Btree level at which we start taking write locks */ short lock; - /* Btree insertion type */ - enum { - BTREE_INSERT, - BTREE_REPLACE - } type:8; - - unsigned csum:1; - unsigned skip:1; - unsigned flush_journal:1; - - unsigned insert_data_done:1; - unsigned lookup_done:1; unsigned insert_collision:1; - - /* Anything after this point won't get zeroed in do_bio_hook() */ - - /* Keys to be inserted */ - struct keylist keys; - BKEY_PADDED(replace); }; -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - -void bch_btree_op_init_stack(struct btree_op *); +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + op->lock = write_lock_level; +} static inline void rw_lock(bool w, struct btree *b, int level) { @@ -290,108 +262,71 @@ static inline void rw_lock(bool w, struct btree *b, int level) static inline void rw_unlock(bool w, struct btree *b) { -#ifdef CONFIG_BCACHE_EDEBUG - unsigned i; - - if (w && b->key.ptr[0]) - for (i = 0; i <= b->nsets; i++) - bch_check_key_order(b, b->sets[i].data); -#endif - if (w) b->seq++; (w ? up_write : up_read)(&b->lock); } -#define insert_lock(s, b) ((b)->level <= (s)->lock) +void bch_btree_node_read(struct btree *); +void bch_btree_node_write(struct btree *, struct closure *); -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ +void bch_btree_set_root(struct btree *); +struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ - if (!IS_ERR(_b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - rw_unlock(_w, _b); \ - } else \ - _r = PTR_ERR(_b); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ - } while (_r == -EINTR); \ - \ - _r; \ -}) +int bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bkey *); +int bch_btree_insert(struct cache_set *, struct keylist *, + atomic_t *, struct bkey *); + +int bch_gc_thread_start(struct cache_set *); +size_t bch_btree_gc_finish(struct cache_set *); +void bch_moving_gc(struct cache_set *); +int bch_btree_check(struct cache_set *); +uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); -static inline bool should_split(struct btree *b) +static inline void wake_up_gc(struct cache_set *c) { - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (i->seq == b->sets[0].data->seq && - b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); + if (c->gc_thread) + wake_up_process(c->gc_thread); } -void bch_btree_node_read(struct btree *); -void bch_btree_node_write(struct btree *, struct closure *); +#define MAP_DONE 0 +#define MAP_CONTINUE 1 -void bch_cannibalize_unlock(struct cache_set *, struct closure *); -void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, - int, struct btree_op *); +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 -bool bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bio *); -int bch_btree_insert(struct btree_op *, struct cache_set *); +#define MAP_END_KEY 1 -int bch_btree_search_recurse(struct btree *, struct btree_op *); +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); -void bch_queue_gc(struct cache_set *); -size_t bch_btree_gc_finish(struct cache_set *); -void bch_moving_gc(struct closure *); -int bch_btree_check(struct cache_set *, struct btree_op *); -uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, - keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9aba2017f0d1..dfff2410322e 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,17 +11,6 @@ #include "closure.h" -void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} -EXPORT_SYMBOL_GPL(closure_queue); - #define CL_FIELD(type, field) \ case TYPE_ ## type: \ return &container_of(cl, struct type, cl)->field @@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl) { switch (cl->type) { CL_FIELD(closure_with_waitlist, wait); - CL_FIELD(closure_with_waitlist_and_timer, wait); - default: - return NULL; - } -} - -static struct timer_list *closure_timer(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_timer, timer); - CL_FIELD(closure_with_waitlist_and_timer, timer); default: return NULL; } @@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) int r = flags & CLOSURE_REMAINING_MASK; BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); /* Must deliver precisely one wakeup */ if (r == 1 && (flags & CLOSURE_SLEEPING)) @@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - /* CLOSURE_BLOCKING might be set - clear it */ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); closure_queue(cl); @@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_sub); +EXPORT_SYMBOL(closure_sub); void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_put); +EXPORT_SYMBOL(closure_put); static void set_waiting(struct closure *cl, unsigned long f) { @@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL_GPL(__closure_wake_up); +EXPORT_SYMBOL(__closure_wake_up); bool closure_wait(struct closure_waitlist *list, struct closure *cl) { @@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl) return true; } -EXPORT_SYMBOL_GPL(closure_wait); +EXPORT_SYMBOL(closure_wait); /** * closure_sync() - sleep until a closure a closure has nothing left to wait on @@ -169,7 +146,7 @@ void closure_sync(struct closure *cl) __closure_end_sleep(cl); } -EXPORT_SYMBOL_GPL(closure_sync); +EXPORT_SYMBOL(closure_sync); /** * closure_trylock() - try to acquire the closure, without waiting @@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent) CLOSURE_REMAINING_INITIALIZER) != -1) return false; - closure_set_ret_ip(cl); - smp_mb(); + cl->parent = parent; if (parent) closure_get(parent); + closure_set_ret_ip(cl); closure_debug_create(cl); return true; } -EXPORT_SYMBOL_GPL(closure_trylock); +EXPORT_SYMBOL(closure_trylock); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list) @@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent, if (closure_trylock(cl, parent)) return; - closure_wait_event_sync(wait_list, &wait, - atomic_read(&cl->remaining) == -1); + closure_wait_event(wait_list, &wait, + atomic_read(&cl->remaining) == -1); } } -EXPORT_SYMBOL_GPL(__closure_lock); - -static void closure_delay_timer_fn(unsigned long data) -{ - struct closure *cl = (struct closure *) data; - closure_sub(cl, CLOSURE_TIMER + 1); -} - -void do_closure_timer_init(struct closure *cl) -{ - struct timer_list *timer = closure_timer(cl); - - init_timer(timer); - timer->data = (unsigned long) cl; - timer->function = closure_delay_timer_fn; -} -EXPORT_SYMBOL_GPL(do_closure_timer_init); - -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer) -{ - if (atomic_read(&cl->remaining) & CLOSURE_TIMER) - return false; - - BUG_ON(timer_pending(timer)); - - timer->expires = jiffies + delay; - - atomic_add(CLOSURE_TIMER + 1, &cl->remaining); - add_timer(timer); - return true; -} -EXPORT_SYMBOL_GPL(__closure_delay); - -void __closure_flush(struct closure *cl, struct timer_list *timer) -{ - if (del_timer(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush); - -void __closure_flush_sync(struct closure *cl, struct timer_list *timer) -{ - if (del_timer_sync(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush_sync); +EXPORT_SYMBOL(__closure_lock); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_create); +EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_destroy); +EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *debug; @@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, "%s%s%s%s%s%s\n", + seq_printf(f, "%s%s%s%s\n", test_bit(WORK_STRUCT_PENDING, work_data_bits(&cl->work)) ? "Q" : "", r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_BLOCKING ? "B" : "", r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : "", - r & CLOSURE_TIMER ? "T" : ""); + r & CLOSURE_SLEEPING ? "Sl" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 00039924ea9d..9762f1be3304 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -155,21 +155,6 @@ * delayed_work embeds a work item and a timer_list. The important thing is, use * it exactly like you would a regular closure and closure_put() will magically * handle everything for you. - * - * We've got closures that embed timers, too. They're called, appropriately - * enough: - * struct closure_with_timer; - * - * This gives you access to closure_delay(). It takes a refcount for a specified - * number of jiffies - you could then call closure_sync() (for a slightly - * convoluted version of msleep()) or continue_at() - which gives you the same - * effect as using a delayed work item, except you can reuse the work_struct - * already embedded in struct closure. - * - * Lastly, there's struct closure_with_waitlist_and_timer. It does what you - * probably expect, if you happen to need the features of both. (You don't - * really want to know how all this is implemented, but if I've done my job - * right you shouldn't have to care). */ struct closure; @@ -182,16 +167,11 @@ struct closure_waitlist { enum closure_type { TYPE_closure = 0, TYPE_closure_with_waitlist = 1, - TYPE_closure_with_timer = 2, - TYPE_closure_with_waitlist_and_timer = 3, - MAX_CLOSURE_TYPE = 3, + MAX_CLOSURE_TYPE = 1, }; enum closure_state { /* - * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of - * waiting asynchronously - * * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by * the thread that owns the closure, and cleared by the thread that's * waking up the closure. @@ -200,10 +180,6 @@ enum closure_state { * - indicates that cl->task is valid and closure_put() may wake it up. * Only set or cleared by the thread that owns the closure. * - * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure - * has an outstanding timer. Must be set by the thread that owns the - * closure, and cleared by the timer function when the timer goes off. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. by @@ -218,19 +194,17 @@ enum closure_state { * closure with this flag set */ - CLOSURE_BITS_START = (1 << 19), - CLOSURE_DESTRUCTOR = (1 << 19), - CLOSURE_BLOCKING = (1 << 21), - CLOSURE_WAITING = (1 << 23), - CLOSURE_SLEEPING = (1 << 25), - CLOSURE_TIMER = (1 << 27), + CLOSURE_BITS_START = (1 << 23), + CLOSURE_DESTRUCTOR = (1 << 23), + CLOSURE_WAITING = (1 << 25), + CLOSURE_SLEEPING = (1 << 27), CLOSURE_RUNNING = (1 << 29), CLOSURE_STACK = (1 << 31), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ - CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ + CLOSURE_RUNNING|CLOSURE_STACK) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -268,17 +242,6 @@ struct closure_with_waitlist { struct closure_waitlist wait; }; -struct closure_with_timer { - struct closure cl; - struct timer_list timer; -}; - -struct closure_with_waitlist_and_timer { - struct closure cl; - struct closure_waitlist wait; - struct timer_list timer; -}; - extern unsigned invalid_closure_type(void); #define __CLOSURE_TYPE(cl, _t) \ @@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void); ( \ __CLOSURE_TYPE(cl, closure) \ __CLOSURE_TYPE(cl, closure_with_waitlist) \ - __CLOSURE_TYPE(cl, closure_with_timer) \ - __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ invalid_closure_type() \ ) void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); -void closure_queue(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); @@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list); -void do_closure_timer_init(struct closure *cl); -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer); -void __closure_flush(struct closure *cl, struct timer_list *timer); -void __closure_flush_sync(struct closure *cl, struct timer_list *timer); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl) atomic_sub(CLOSURE_RUNNING, &cl->remaining); } -static inline bool closure_is_stopped(struct closure *cl) -{ - return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); -} - static inline bool closure_is_unlocked(struct closure *cl) { return atomic_read(&cl->remaining) == -1; @@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl) static inline void do_closure_init(struct closure *cl, struct closure *parent, bool running) { - switch (cl->type) { - case TYPE_closure_with_timer: - case TYPE_closure_with_waitlist_and_timer: - do_closure_timer_init(cl); - default: - break; - } - cl->parent = parent; if (parent) closure_get(parent); @@ -429,8 +370,7 @@ do { \ static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| - CLOSURE_BLOCKING|CLOSURE_STACK); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** @@ -461,24 +401,6 @@ do { \ #define closure_lock(cl, parent) \ __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) -/** - * closure_delay() - delay some number of jiffies - * @cl: the closure that will sleep - * @delay: the delay in jiffies - * - * Takes a refcount on @cl which will be released after @delay jiffies; this may - * be used to have a function run after a delay with continue_at(), or - * closure_sync() may be used for a convoluted version of msleep(). - */ -#define closure_delay(cl, delay) \ - __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) - -#define closure_flush(cl) \ - __closure_flush(__to_internal_closure(cl), &(cl)->timer) - -#define closure_flush_sync(cl) \ - __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) - static inline void __closure_end_sleep(struct closure *cl) { __set_current_state(TASK_RUNNING); @@ -498,40 +420,6 @@ static inline void __closure_start_sleep(struct closure *cl) } /** - * closure_blocking() - returns true if the closure is in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - */ -static inline bool closure_blocking(struct closure *cl) -{ - return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; -} - -/** - * set_closure_blocking() - put a closure in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - * - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void set_closure_blocking(struct closure *cl) -{ - if (!closure_blocking(cl)) - atomic_add(CLOSURE_BLOCKING, &cl->remaining); -} - -/* - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void clear_closure_blocking(struct closure *cl) -{ - if (closure_blocking(cl)) - atomic_sub(CLOSURE_BLOCKING, &cl->remaining); -} - -/** * closure_wake_up() - wake up all closures on a wait list. */ static inline void closure_wake_up(struct closure_waitlist *list) @@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list) * refcount on our closure. If this was a stack allocated closure, that would be * bad. */ -#define __closure_wait_event(list, cl, condition, _block) \ +#define closure_wait_event(list, cl, condition) \ ({ \ - bool block = _block; \ typeof(condition) ret; \ \ while (1) { \ ret = (condition); \ if (ret) { \ __closure_wake_up(list); \ - if (block) \ - closure_sync(cl); \ - \ + closure_sync(cl); \ break; \ } \ \ - if (block) \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) { \ - if (!block) \ - break; \ + __closure_start_sleep(cl); \ \ + if (!closure_wait(list, cl)) \ schedule(); \ - } \ } \ \ ret; \ }) -/** - * closure_wait_event() - wait on a condition, synchronously or asynchronously. - * @list: the wait list to wait on - * @cl: the closure that is doing the waiting - * @condition: a C expression for the event to wait for - * - * If the closure is in blocking mode, sleeps until the @condition evaluates to - * true - exactly like wait_event(). - * - * If the closure is not in blocking mode, waits asynchronously; if the - * condition is currently false the @cl is put onto @list and returns. @list - * owns a refcount on @cl; closure_sync() or continue_at() may be used later to - * wait for another thread to wake up @list, which drops the refcount on @cl. - * - * Returns the value of @condition; @cl will be on @list iff @condition was - * false. - * - * closure_wake_up(@list) must be called after changing any variable that could - * cause @condition to become true. - */ -#define closure_wait_event(list, cl, condition) \ - __closure_wait_event(list, cl, condition, closure_blocking(cl)) - -#define closure_wait_event_async(list, cl, condition) \ - __closure_wait_event(list, cl, condition, false) - -#define closure_wait_event_sync(list, cl, condition) \ - __closure_wait_event(list, cl, condition, true) +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} static inline void set_closure_fn(struct closure *cl, closure_fn *fn, struct workqueue_struct *wq) @@ -642,7 +503,7 @@ do { \ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ - closure_queue(cl); \ + closure_queue(_cl); \ return; \ } while (0) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 88e6411eab4f..264fcfbd6290 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,7 +8,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include <linux/console.h> #include <linux/debugfs.h> @@ -77,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) return out - buf; } -int bch_btree_to_text(char *buf, size_t size, const struct btree *b) -{ - return scnprintf(buf, size, "%zu level %i/%i", - PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); -} - -#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) - -static bool skipped_backwards(struct btree *b, struct bkey *k) -{ - return bkey_cmp(k, (!b->level) - ? &START_KEY(bkey_next(k)) - : bkey_next(k)) > 0; -} +#ifdef CONFIG_BCACHE_DEBUG static void dump_bset(struct btree *b, struct bset *i) { - struct bkey *k; + struct bkey *k, *next; unsigned j; char buf[80]; - for (k = i->start; k < end(i); k = bkey_next(k)) { + for (k = i->start; k < end(i); k = next) { + next = bkey_next(k); + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), (uint64_t *) k - i->d, i->keys, buf); @@ -115,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i) printk(" %s\n", bch_ptr_status(b->c, k)); - if (bkey_next(k) < end(i) && - skipped_backwards(b, k)) + if (next < end(i) && + bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) printk(KERN_ERR "Key skipped backwards\n"); } } -#endif +static void bch_dump_bucket(struct btree *b) +{ + unsigned i; -#ifdef CONFIG_BCACHE_DEBUG + console_lock(); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + console_unlock(); +} void bch_btree_verify(struct btree *b, struct bset *new) { @@ -176,66 +169,44 @@ void bch_btree_verify(struct btree *b, struct bset *new) mutex_unlock(&b->c->verify_lock); } -static void data_verify_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - closure_put(cl); -} - -void bch_data_verify(struct search *s) +void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct closure *cl = &s->cl; struct bio *check; struct bio_vec *bv; int i; - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - - check = bio_clone(s->orig_bio, GFP_NOIO); + check = bio_clone(bio, GFP_NOIO); if (!check) return; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; - check->bi_rw = READ_SYNC; - check->bi_private = cl; - check->bi_end_io = data_verify_endio; - - closure_bio_submit(check, cl, &dc->disk); - closure_sync(cl); + submit_bio_wait(READ_SYNC, check); - bio_for_each_segment(bv, s->orig_bio, i) { - void *p1 = kmap(bv->bv_page); - void *p2 = kmap(check->bi_io_vec[i].bv_page); + bio_for_each_segment(bv, bio, i) { + void *p1 = kmap_atomic(bv->bv_page); + void *p2 = page_address(check->bi_io_vec[i].bv_page); - if (memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len)) - printk(KERN_ERR - "bcache (%s): verify failed at sector %llu\n", - bdevname(dc->bdev, name), - (uint64_t) s->orig_bio->bi_sector); + cache_set_err_on(memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_sector); - kunmap(bv->bv_page); - kunmap(check->bi_io_vec[i].bv_page); + kunmap_atomic(p1); } - __bio_for_each_segment(bv, check, i, 0) + bio_for_each_segment_all(bv, check, i) __free_page(bv->bv_page); out_put: bio_put(check); } -#endif - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *b) +int __bch_count_data(struct btree *b) { unsigned ret = 0; struct btree_iter iter; @@ -247,72 +218,60 @@ unsigned bch_count_data(struct btree *b) return ret; } -static void vdump_bucket_and_panic(struct btree *b, const char *fmt, - va_list args) -{ - unsigned i; - char buf[80]; - - console_lock(); - - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - - vprintk(fmt, args); - - console_unlock(); - - bch_btree_to_text(buf, sizeof(buf), b); - panic("at %s\n", buf); -} - -void bch_check_key_order_msg(struct btree *b, struct bset *i, - const char *fmt, ...) -{ - struct bkey *k; - - if (!i->keys) - return; - - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) - if (skipped_backwards(b, k)) { - va_list args; - va_start(args, fmt); - - vdump_bucket_and_panic(b, fmt, args); - va_end(args); - } -} - -void bch_check_keys(struct btree *b, const char *fmt, ...) +void __bch_check_keys(struct btree *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; struct btree_iter iter; - - if (b->level) - return; + const char *err; for_each_key(b, k, &iter) { - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { - printk(KERN_ERR "Keys out of order:\n"); - goto bug; - } - - if (bch_ptr_invalid(b, k)) - continue; - - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { - printk(KERN_ERR "Overlapping keys:\n"); - goto bug; + if (!b->level) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; } p = k; } + + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; + return; bug: + bch_dump_bucket(b); + va_start(args, fmt); - vdump_bucket_and_panic(b, fmt, args); + vprintk(fmt, args); va_end(args); + + panic("bcache error: %s:\n", err); +} + +void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } } #endif diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1c39b5a2489b..2ede60e31874 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -4,40 +4,44 @@ /* Btree/bkey debug printing */ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); -int bch_btree_to_text(char *buf, size_t size, const struct btree *b); - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *); -void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); -void bch_check_keys(struct btree *, const char *, ...); - -#define bch_check_key_order(b, i) \ - bch_check_key_order_msg(b, i, "keys out of order") -#define EBUG_ON(cond) BUG_ON(cond) - -#else /* EDEBUG */ - -#define bch_count_data(b) 0 -#define bch_check_key_order(b, i) do {} while (0) -#define bch_check_key_order_msg(b, i, ...) do {} while (0) -#define bch_check_keys(b, ...) do {} while (0) -#define EBUG_ON(cond) do {} while (0) - -#endif #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); -void bch_data_verify(struct search *); +void bch_data_verify(struct cached_dev *, struct bio *); +int __bch_count_data(struct btree *); +void __bch_check_keys(struct btree *, const char *, ...); +void bch_btree_iter_next_check(struct btree_iter *); + +#define EBUG_ON(cond) BUG_ON(cond) +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct search *s) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline int __bch_count_data(struct btree *b) { return -1; } +static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#define EBUG_ON(cond) do { if (cond); } while (0) +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 #endif +#define bch_count_data(b) \ + (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) + +#define bch_check_keys(b, ...) \ +do { \ + if (expensive_debug_checks((b)->c)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8435f81e5d85..ecdaa671bd50 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include <trace/events/bcache.h> @@ -31,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error) } static int journal_read_bucket(struct cache *ca, struct list_head *list, - struct btree_op *op, unsigned bucket_index) + unsigned bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; unsigned len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + closure_init_stack(&cl); + pr_debug("reading %llu", (uint64_t) bucket); while (offset < ca->sb.bucket_size) { @@ -55,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset; bio->bi_size = len << 9; bio->bi_end_io = journal_read_endio; - bio->bi_private = &op->cl; + bio->bi_private = &cl; bch_bio_map(bio, data); - closure_bio_submit(bio, &op->cl, ca); - closure_sync(&op->cl); + closure_bio_submit(bio, &cl, ca); + closure_sync(&cl); /* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means @@ -72,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(&ca->sb)) return ret; if (bytes > left << 9) @@ -129,12 +131,11 @@ next_set: return ret; } -int bch_journal_read(struct cache_set *c, struct list_head *list, - struct btree_op *op) +int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, op, b); \ + int ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -292,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -int bch_journal_replay(struct cache_set *s, struct list_head *list, - struct btree_op *op) +int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; struct bkey *k; @@ -301,31 +301,30 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_entry(list->prev, struct journal_replay, list); uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + + bch_keylist_init(&keylist); list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - if (n != i->j.seq) - pr_err( - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); + cache_set_err_on(n != i->j.seq, s, +"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); for (k = i->j.start; k < end(&i->j); k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(op->keys.top, k); - bch_keylist_push(&op->keys); - - op->journal = i->pin; - atomic_inc(op->journal); + bkey_copy(keylist.top, k); + bch_keylist_push(&keylist); - ret = bch_btree_insert(op, s); + ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) goto err; - BUG_ON(!bch_keylist_empty(&op->keys)); + BUG_ON(!bch_keylist_empty(&keylist)); keys++; cond_resched(); @@ -339,14 +338,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, pr_info("journal replay done, %i keys in %i entries, seq %llu", keys, entries, end); - +err: while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); list_del(&i->list); kfree(i); } -err: - closure_sync(&op->cl); + return ret; } @@ -358,48 +356,35 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best = NULL; - unsigned iter; + struct btree *b, *best; + unsigned i; +retry: + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { + best = b; + } + } - for_each_cached_btree(b, c, iter) { - if (!down_write_trylock(&b->lock)) - continue; + b = best; + if (b) { + rw_lock(true, b, b->level); - if (!btree_node_dirty(b) || - !btree_current_write(b)->journal) { + if (!btree_current_write(b)->journal) { rw_unlock(true, b); - continue; + /* We raced */ + goto retry; } - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { - rw_unlock(true, best); - best = b; - } else - rw_unlock(true, b); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } - - if (best) - goto out; - - /* We can't find the best btree node, just pick the first */ - list_for_each_entry(b, &c->btree_cache, list) - if (!b->level && btree_node_dirty(b)) { - best = b; - rw_lock(true, best, best->level); - goto found; - } - -out: - if (!best) - return; -found: - if (btree_node_dirty(best)) - bch_btree_node_write(best, NULL); - rw_unlock(true, best); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -495,7 +480,7 @@ static void journal_reclaim(struct cache_set *c) do_journal_discard(ca); if (c->journal.blocks_free) - return; + goto out; /* * Allocate: @@ -521,7 +506,7 @@ static void journal_reclaim(struct cache_set *c) if (n) c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - +out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); } @@ -554,32 +539,26 @@ static void journal_write_endio(struct bio *bio, int error) struct journal_write *w = bio->bi_private; cache_set_err_on(error, w->c, "journal io error"); - closure_put(&w->c->journal.io.cl); + closure_put(&w->c->journal.io); } static void journal_write(struct closure *); static void journal_write_done(struct closure *cl) { - struct journal *j = container_of(cl, struct journal, io.cl); - struct cache_set *c = container_of(j, struct cache_set, journal); - + struct journal *j = container_of(cl, struct journal, io); struct journal_write *w = (j->cur == j->w) ? &j->w[1] : &j->w[0]; __closure_wake_up(&w->wait); - - if (c->journal_delay_ms) - closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); - - continue_at(cl, journal_write, system_wq); + continue_at_nobarrier(cl, journal_write, system_wq); } static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; @@ -617,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) for_each_cache(ca, c, i) w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - w->data->magic = jset_magic(c); + w->data->magic = jset_magic(&c->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); @@ -660,121 +639,134 @@ static void journal_write_unlocked(struct closure *cl) static void journal_write(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); spin_lock(&c->journal.lock); journal_write_unlocked(cl); } -static void __journal_try_write(struct cache_set *c, bool noflush) +static void journal_try_write(struct cache_set *c) __releases(c->journal.lock) { - struct closure *cl = &c->journal.io.cl; + struct closure *cl = &c->journal.io; + struct journal_write *w = c->journal.cur; - if (!closure_trylock(cl, &c->cl)) - spin_unlock(&c->journal.lock); - else if (noflush && journal_full(&c->journal)) { - spin_unlock(&c->journal.lock); - continue_at(cl, journal_write, system_wq); - } else + w->need_write = true; + + if (closure_trylock(cl, &c->cl)) journal_write_unlocked(cl); + else + spin_unlock(&c->journal.lock); } -#define journal_try_write(c) __journal_try_write(c, false) - -void bch_journal_meta(struct cache_set *c, struct closure *cl) +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned nkeys) { - struct journal_write *w; + size_t sectors; + struct closure cl; - if (CACHE_SYNC(&c->sb)) { - spin_lock(&c->journal.lock); + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); - w = c->journal.cur; - w->need_write = true; + while (1) { + struct journal_write *w = c->journal.cur; - if (cl) - BUG_ON(!closure_wait(&w->wait, cl)); + sectors = __set_blocks(w->data, w->data->keys + nkeys, + c) * c->sb.block_size; - closure_flush(&c->journal.io); - __journal_try_write(c, true); + if (sectors <= min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + /* XXX: tracepoint */ + if (!journal_full(&c->journal)) { + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. + */ + BUG_ON(!w->data->keys); + + closure_wait(&w->wait, &cl); + journal_try_write(c); /* unlocks */ + } else { + trace_bcache_journal_full(c); + + closure_wait(&c->journal.wait, &cl); + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + } + + closure_sync(&cl); + spin_lock(&c->journal.lock); } } +static void journal_write_work(struct work_struct *work) +{ + struct cache_set *c = container_of(to_delayed_work(work), + struct cache_set, + journal.work); + spin_lock(&c->journal.lock); + journal_try_write(c); +} + /* * Entry point to the journalling code - bio_insert() and btree_invalidate() * pass bch_journal() a list of keys to be journalled, and then * bch_journal() hands those same keys off to btree_insert_async() */ -void bch_journal(struct closure *cl) +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct cache_set *c = op->c; struct journal_write *w; - size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; - - if (op->type != BTREE_INSERT || - !CACHE_SYNC(&c->sb)) - goto out; + atomic_t *ret; - /* - * If we're looping because we errored, might already be waiting on - * another journal write: - */ - while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) - closure_sync(cl->parent); + if (!CACHE_SYNC(&c->sb)) + return NULL; - spin_lock(&c->journal.lock); + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - if (journal_full(&c->journal)) { - trace_bcache_journal_full(c); + memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); - closure_wait(&c->journal.wait, cl); + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); - journal_reclaim(c); + if (parent) { + closure_wait(&w->wait, parent); + journal_try_write(c); + } else if (!w->need_write) { + schedule_delayed_work(&c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { spin_unlock(&c->journal.lock); - - btree_flush_write(c); - continue_at(cl, bch_journal, bcache_wq); } - w = c->journal.cur; - w->need_write = true; - b = __set_blocks(w->data, w->data->keys + n, c); - - if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || - b > c->journal.blocks_free) { - trace_bcache_journal_entry_full(c); - - /* - * XXX: If we were inserting so many keys that they won't fit in - * an _empty_ journal write, we'll deadlock. For now, handle - * this in bch_keylist_realloc() - but something to think about. - */ - BUG_ON(!w->data->keys); - - BUG_ON(!closure_wait(&w->wait, cl)); - - closure_flush(&c->journal.io); - journal_try_write(c); - continue_at(cl, bch_journal, bcache_wq); - } - - memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); - w->data->keys += n; + return ret; +} - op->journal = &fifo_back(&c->journal.pin); - atomic_inc(op->journal); +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; - if (op->flush_journal) { - closure_flush(&c->journal.io); - closure_wait(&w->wait, cl->parent); - } + bch_keylist_init(&keys); - journal_try_write(c); -out: - bch_btree_insert_async(cl); + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); } void bch_journal_free(struct cache_set *c) @@ -790,6 +782,7 @@ int bch_journal_alloc(struct cache_set *c) closure_init_unlocked(&j->io); spin_lock_init(&j->lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3d7851274b04..a6472fda94b2 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -75,43 +75,6 @@ * nodes that are pinning the oldest journal entries first. */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -/* Always latest UUID format */ -#define BCACHE_JSET_VERSION_UUID 1 -#define BCACHE_JSET_VERSION 1 - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. - */ -struct jset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - uint64_t last_seq; - - BKEY_PADDED(uuid_bucket); - BKEY_PADDED(btree_root); - uint16_t btree_level; - uint16_t pad[3]; - - uint64_t prio_bucket[MAX_CACHES_PER_SET]; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration @@ -140,7 +103,8 @@ struct journal { spinlock_t lock; /* used when waiting because the journal was full */ struct closure_waitlist wait; - struct closure_with_timer io; + struct closure io; + struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ unsigned blocks_free; @@ -188,8 +152,7 @@ struct journal_device { }; #define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)->journal) > \ - fifo_idx(&(c)->journal.pin, (r)->journal)) + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) #define JOURNAL_PIN 20000 @@ -199,15 +162,14 @@ struct journal_device { struct closure; struct cache_set; struct btree_op; +struct keylist; -void bch_journal(struct closure *); +atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *, - struct btree_op *); -int bch_journal_replay(struct cache_set *, struct list_head *, - struct btree_op *); +int bch_journal_read(struct cache_set *, struct list_head *); +int bch_journal_replay(struct cache_set *, struct list_head *); void bch_journal_free(struct cache_set *); int bch_journal_alloc(struct cache_set *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1a3b4f4786c3..7c1275e66025 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -12,8 +12,9 @@ #include <trace/events/bcache.h> struct moving_io { + struct closure cl; struct keybuf_key *w; - struct search s; + struct data_insert_op op; struct bbio bio; }; @@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); kfree(io); } static void write_moving_finish(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; struct bio_vec *bv; int i; @@ -52,13 +53,12 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.op.insert_collision) + if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); - atomic_dec_bug(&io->s.op.c->in_flight); - closure_wake_up(&io->s.op.c->moving_gc_wait); + up(&io->op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -66,12 +66,12 @@ static void write_moving_finish(struct closure *cl) static void read_moving_endio(struct bio *bio, int error) { struct moving_io *io = container_of(bio->bi_private, - struct moving_io, s.cl); + struct moving_io, cl); if (error) - io->s.error = error; + io->op.error = error; - bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -85,54 +85,53 @@ static void moving_init(struct moving_io *io) bio->bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS); - bio->bi_private = &io->s.cl; + bio->bi_private = &io->cl; bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } static void write_moving(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; - if (!s->error) { + if (!op->error) { moving_init(io); - io->bio.bio.bi_sector = KEY_START(&io->w->key); - s->op.lock = -1; - s->op.write_prio = 1; - s->op.cache_bio = &io->bio.bio; + io->bio.bio.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; - s->writeback = KEY_DIRTY(&io->w->key); - s->op.csum = KEY_CSUM(&io->w->key); + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); - s->op.type = BTREE_REPLACE; - bkey_copy(&s->op.replace, &io->w->key); + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; - closure_init(&s->op.cl, cl); - bch_insert_data(&s->op.cl); + closure_call(&op->cl, bch_data_insert, NULL, cl); } - continue_at(cl, write_moving_finish, NULL); + continue_at(cl, write_moving_finish, system_wq); } static void read_moving_submit(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); - continue_at(cl, write_moving, bch_gc_wq); + continue_at(cl, write_moving, system_wq); } -static void read_moving(struct closure *cl) +static void read_moving(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, moving_gc); struct keybuf_key *w; struct moving_io *io; struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); /* XXX: if we error, background writeback could stall indefinitely */ @@ -150,8 +149,8 @@ static void read_moving(struct closure *cl) w->private = io; io->w = w; - io->s.op.inode = KEY_INODE(&w->key); - io->s.op.c = c; + io->op.inode = KEY_INODE(&w->key); + io->op.c = c; moving_init(io); bio = &io->bio.bio; @@ -164,13 +163,8 @@ static void read_moving(struct closure *cl) trace_bcache_gc_copy(&w->key); - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); - - if (atomic_inc_return(&c->in_flight) >= 64) { - closure_wait_event(&c->moving_gc_wait, cl, - atomic_read(&c->in_flight) < 64); - continue_at(cl, read_moving, bch_gc_wq); - } + down(&c->moving_in_flight); + closure_call(&io->cl, read_moving_submit, NULL, &cl); } if (0) { @@ -180,7 +174,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) bch_keybuf_del(&c->moving_gc_keys, w); } - closure_return(cl); + closure_sync(&cl); } static bool bucket_cmp(struct bucket *l, struct bucket *r) @@ -193,15 +187,14 @@ static unsigned bucket_heap_top(struct cache *ca) return GC_SECTORS_USED(heap_peek(&ca->heap)); } -void bch_moving_gc(struct closure *cl) +void bch_moving_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); struct cache *ca; struct bucket *b; unsigned i; if (!c->copy_gc_enabled) - closure_return(cl); + return; mutex_lock(&c->bucket_lock); @@ -242,13 +235,11 @@ void bch_moving_gc(struct closure *cl) c->moving_gc_keys.last_scanned = ZERO_KEY; - closure_init(&c->moving_gc, cl); - read_moving(&c->moving_gc); - - closure_return(cl); + read_moving(c); } void bch_moving_init_cache_set(struct cache_set *c) { bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2a7f0dd6abab..fbcc851ed5a5 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,7 +25,7 @@ struct kmem_cache *bch_search_cache; -static void check_should_skip(struct cached_dev *, struct search *); +static void bch_data_insert_start(struct closure *); /* Cgroup interface */ @@ -213,221 +213,79 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bio_invalidate(struct closure *cl) +static void bch_data_insert_keys(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct bio *bio = op->cache_bio; - - pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); - - while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); - - if (bch_keylist_realloc(&op->keys, 0, op->c)) - goto out; - - bio->bi_sector += len; - bio->bi_size -= len << 9; - - bch_keylist_add(&op->keys, - &KEY(op->inode, bio->bi_sector, len)); - } - - op->insert_data_done = true; - bio_put(bio); -out: - continue_at(cl, bch_journal, bcache_wq); -} - -struct open_bucket { - struct list_head list; - struct task_struct *last; - unsigned sectors_free; - BKEY_PADDED(key); -}; - -void bch_open_buckets_free(struct cache_set *c) -{ - struct open_bucket *b; - - while (!list_empty(&c->data_buckets)) { - b = list_first_entry(&c->data_buckets, - struct open_bucket, list); - list_del(&b->list); - kfree(b); - } -} - -int bch_open_buckets_alloc(struct cache_set *c) -{ - int i; - - spin_lock_init(&c->data_bucket_lock); - - for (i = 0; i < 6; i++) { - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); - if (!b) - return -ENOMEM; - - list_add(&b->list, &c->data_buckets); - } - - return 0; -} - -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. - * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. - */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - struct task_struct *task, - struct bkey *alloc) -{ - struct open_bucket *ret, *ret_task = NULL; - - list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) - goto found; - else if (ret->last == task) - ret_task = ret; - - ret = ret_task ?: list_first_entry(&c->data_buckets, - struct open_bucket, list); -found: - if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; - bkey_copy(&ret->key, alloc); - bkey_init(alloc); - } - - if (!ret->sectors_free) - ret = NULL; - - return ret; -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many - * sectors were actually allocated. - * - * If s->writeback is true, will not fail. - */ -static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, - struct search *s) -{ - struct cache_set *c = s->op.c; - struct open_bucket *b; - BKEY_PADDED(key) alloc; - struct closure cl, *w = NULL; - unsigned i; - - if (s->writeback) { - closure_init_stack(&cl); - w = &cl; - } + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + atomic_t *journal_ref = NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; + int ret; /* - * We might have to allocate a new bucket, which we can't do with a - * spinlock held. So if we have to allocate, we drop the lock, allocate - * and then retry. KEY_PTRS() indicates whether alloc points to - * allocated bucket(s). + * If we're looping, might already be waiting on + * another journal write - can't wait on more than one journal write at + * a time + * + * XXX: this looks wrong */ +#if 0 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) + closure_sync(&s->cl); +#endif - bkey_init(&alloc.key); - spin_lock(&c->data_bucket_lock); - - while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->op.write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; - - spin_unlock(&c->data_bucket_lock); - - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) - return false; + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); - spin_lock(&c->data_bucket_lock); + ret = bch_btree_insert(op->c, &op->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + op->replace_collision = true; + } else if (ret) { + op->error = -ENOMEM; + op->insert_data_done = true; } - /* - * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but - * didn't use it, drop the refcount bch_bucket_alloc_set() took: - */ - if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); - - for (i = 0; i < KEY_PTRS(&b->key); i++) - EBUG_ON(ptr_stale(c, &b->key, i)); + if (journal_ref) + atomic_dec_bug(journal_ref); - /* Set up the pointer to the space we're allocating: */ + if (!op->insert_data_done) + continue_at(cl, bch_data_insert_start, bcache_wq); - for (i = 0; i < KEY_PTRS(&b->key); i++) - k->ptr[i] = b->key.ptr[i]; + bch_keylist_free(&op->insert_keys); + closure_return(cl); +} - sectors = min(sectors, b->sectors_free); +static void bch_data_invalidate(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); - SET_KEY_SIZE(k, sectors); - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_sector); - /* - * Move b to the end of the lru, and keep track of what this bucket was - * last used for: - */ - list_move_tail(&b->list, &c->data_buckets); - bkey_copy_key(&b->key, k); - b->last = s->task; + while (bio_sectors(bio)) { + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); - b->sectors_free -= sectors; + if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) + goto out; - for (i = 0; i < KEY_PTRS(&b->key); i++) { - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; - atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, bio->bi_sector, sectors)); } - if (b->sectors_free < c->sb.block_size) - b->sectors_free = 0; - - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - if (b->sectors_free) - for (i = 0; i < KEY_PTRS(&b->key); i++) - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - - spin_unlock(&c->data_bucket_lock); - return true; + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, bcache_wq); } -static void bch_insert_data_error(struct closure *cl) +static void bch_data_insert_error(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -438,35 +296,34 @@ static void bch_insert_data_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; - while (src != op->keys.top) { + while (src != op->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); - bkey_copy(dst, src); + memmove(dst, src, bkey_bytes(src)); dst = bkey_next(dst); src = n; } - op->keys.top = dst; + op->insert_keys.top = dst; - bch_journal(cl); + bch_data_insert_keys(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); if (error) { /* TODO: We could try to recover from this. */ - if (s->writeback) - s->error = error; - else if (s->write) - set_closure_fn(cl, bch_insert_data_error, bcache_wq); + if (op->writeback) + op->error = error; + else if (!op->replace) + set_closure_fn(cl, bch_data_insert_error, bcache_wq); else set_closure_fn(cl, NULL, NULL); } @@ -474,18 +331,17 @@ static void bch_insert_data_endio(struct bio *bio, int error) bch_bbio_endio(op->c, bio, error, "writing data to cache"); } -static void bch_insert_data_loop(struct closure *cl) +static void bch_data_insert_start(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio, *n; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; - if (op->skip) - return bio_invalidate(cl); + if (op->bypass) + return bch_data_invalidate(cl); if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); - bch_queue_gc(op->c); + wake_up_gc(op->c); } /* @@ -497,29 +353,30 @@ static void bch_insert_data_loop(struct closure *cl) do { unsigned i; struct bkey *k; - struct bio_set *split = s->d - ? s->d->bio_split : op->c->bio_split; + struct bio_set *split = op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&op->keys, + if (bch_keylist_realloc(&op->insert_keys, 1 + (op->csum ? 1 : 0), op->c)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); - k = op->keys.top; + k = op->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - n->bi_end_io = bch_insert_data_endio; + n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; - if (s->writeback) { + if (op->writeback) { SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) @@ -532,17 +389,17 @@ static void bch_insert_data_loop(struct closure *cl) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&op->keys); + bch_keylist_push(&op->insert_keys); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ - BUG_ON(s->writeback); + BUG_ON(op->writeback); /* * But if it's not a writeback write we'd rather just bail out if @@ -550,15 +407,15 @@ err: * we might be starving btree writes for gc or something. */ - if (s->write) { + if (!op->replace) { /* * Writethrough write: We can't complete the write until we've * updated the index. But we don't want to delay the write while * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - op->skip = true; - return bio_invalidate(cl); + op->bypass = true; + return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data @@ -567,15 +424,15 @@ err: op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&op->keys)) - continue_at(cl, bch_journal, bcache_wq); + if (!bch_keylist_empty(&op->insert_keys)) + continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); } } /** - * bch_insert_data - stick some data in the cache + * bch_data_insert - stick some data in the cache * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -587,56 +444,179 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->skip is true, instead of inserting the data it invalidates the region - * of the cache represented by op->cache_bio and op->inode. + * If s->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by s->cache_bio and op->inode. */ -void bch_insert_data(struct closure *cl) +void bch_data_insert(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + trace_bcache_write(op->bio, op->writeback, op->bypass); - bch_keylist_init(&op->keys); - bio_get(op->cache_bio); - bch_insert_data_loop(cl); + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); + bch_data_insert_start(cl); } -void bch_btree_insert_async(struct closure *cl) +/* Congested? */ + +unsigned bch_get_congested(struct cache_set *c) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + int i; + long rand; - if (bch_btree_insert(op, op->c)) { - s->error = -ENOMEM; - op->insert_data_done = true; - } + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; - if (op->insert_data_done) { - bch_keylist_free(&op->keys); - closure_return(cl); - } else - continue_at(cl, bch_insert_data_loop, bcache_wq); + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; } -/* Common code for the make_request functions */ +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); -static void request_endio(struct bio *bio, int error) + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) { - struct closure *cl = bio->bi_private; + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} - if (error) { - struct search *s = container_of(cl, struct search, cl); - s->error = error; - /* Only cache read errors are recoverable */ - s->recoverable = false; +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + struct task_struct *task = current; + struct io *i; + + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; } - bio_put(bio); - closure_put(cl); + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; } -void bch_cache_read_endio(struct bio *bio, int error) +/* Cache lookup */ + +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bcache_device *d; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + + unsigned insert_bio_sectors; + + unsigned recoverable:1; + unsigned unaligned_bvec:1; + unsigned write:1; + unsigned read_dirty_data:1; + + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + +static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; @@ -650,13 +630,113 @@ void bch_cache_read_endio(struct bio *bio, int error) */ if (error) - s->error = error; - else if (ptr_stale(s->op.c, &b->key, 0)) { - atomic_long_inc(&s->op.c->cache_read_races); - s->error = -EINTR; + s->iop.error = error; + else if (ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.error = -EINTR; } - bch_bbio_endio(s->op.c, bio, error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; + unsigned ptr; + + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->iop.inode || + KEY_START(k) > bio->bi_sector) { + unsigned bio_sectors = bio_sectors(bio); + unsigned sectors = KEY_INODE(k) == s->iop.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_sector) + : INT_MAX; + + int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); + } + + if (!KEY_SIZE(k)) + return MAP_CONTINUE; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector), + GFP_NOIO, s->d->bio_split); + + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); + + bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + __bch_submit_bbio(n, b->c); + return n == bio ? MAP_DONE : MAP_CONTINUE; +} + +static void cache_lookup(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, iop.cl); + struct bio *bio = &s->bio.bio; + + int ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); + if (ret == -EAGAIN) + continue_at(cl, cache_lookup, bcache_wq); + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->iop.error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); } static void bio_complete(struct search *s) @@ -670,8 +750,8 @@ static void bio_complete(struct search *s) part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); part_stat_unlock(); - trace_bcache_request_end(s, s->orig_bio); - bio_endio(s->orig_bio, s->error); + trace_bcache_request_end(s->d, s->orig_bio); + bio_endio(s->orig_bio, s->iop.error); s->orig_bio = NULL; } } @@ -691,8 +771,8 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->op.cache_bio) - bio_put(s->op.cache_bio); + if (s->iop.bio) + bio_put(s->iop.bio); if (s->unaligned_bvec) mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); @@ -703,21 +783,22 @@ static void search_free(struct closure *cl) static struct search *search_alloc(struct bio *bio, struct bcache_device *d) { + struct search *s; struct bio_vec *bv; - struct search *s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, op.keys)); + + s = mempool_alloc(d->c->search, GFP_NOIO); + memset(s, 0, offsetof(struct search, iop.insert_keys)); __closure_init(&s->cl, NULL); - s->op.inode = d->id; - s->op.c = d->c; + s->iop.inode = d->id; + s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->task = current; + s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; - s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -734,18 +815,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) return s; } -static void btree_read_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, btree_read_async, bcache_wq); - - closure_return(cl); -} - /* Cached devices */ static void cached_dev_bio_complete(struct closure *cl) @@ -759,27 +828,28 @@ static void cached_dev_bio_complete(struct closure *cl) /* Process reads */ -static void cached_dev_read_complete(struct closure *cl) +static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.insert_collision) - bch_mark_cache_miss_collision(s); + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->op.cache_bio) { + if (s->iop.bio) { int i; struct bio_vec *bv; - __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + bio_for_each_segment_all(bv, s->iop.bio, i) __free_page(bv->bv_page); } cached_dev_bio_complete(cl); } -static void request_read_error(struct closure *cl) +static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; struct bio_vec *bv; int i; @@ -787,7 +857,7 @@ static void request_read_error(struct closure *cl) /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->error = 0; + s->iop.error = 0; bv = s->bio.bio.bi_io_vec; do_bio_hook(s); s->bio.bio.bi_io_vec = bv; @@ -803,146 +873,148 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - closure_bio_submit(&s->bio.bio, &s->cl, s->d); + closure_bio_submit(bio, cl, s->d); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done(struct closure *cl) +static void cached_dev_read_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* - * s->cache_bio != NULL implies that we had a cache miss; cache_bio now - * contains data ready to be inserted into the cache. + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. * * First, we copy the data we just read from cache_bio's bounce buffers * to the buffers the original bio pointed to: */ - if (s->op.cache_bio) { - bio_reset(s->op.cache_bio); - s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; - s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->op.cache_bio, NULL); + if (s->iop.bio) { + bio_reset(s->iop.bio); + s->iop.bio->bi_sector = s->cache_miss->bi_sector; + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + s->iop.bio->bi_size = s->insert_bio_sectors << 9; + bch_bio_map(s->iop.bio, NULL); - bio_copy_data(s->cache_miss, s->op.cache_bio); + bio_copy_data(s->cache_miss, s->iop.bio); bio_put(s->cache_miss); s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable) - bch_data_verify(s); + if (verify(dc, &s->bio.bio) && s->recoverable && + !s->unaligned_bvec && !s->read_dirty_data) + bch_data_verify(dc, s->orig_bio); bio_complete(s); - if (s->op.cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { - s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done_bh(struct closure *cl) +static void cached_dev_read_done_bh(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_miss, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->error) - continue_at_nobarrier(cl, request_read_error, bcache_wq); - else if (s->op.cache_bio || verify(dc, &s->bio.bio)) - continue_at_nobarrier(cl, request_read_done, bcache_wq); + if (s->iop.error) + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); + else if (s->iop.bio || verify(dc, &s->bio.bio)) + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else - continue_at_nobarrier(cl, cached_dev_read_complete, NULL); + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); } static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - int ret = 0; - unsigned reada; + int ret = MAP_CONTINUE; + unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct bio *miss; - - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + struct bio *miss, *cache_bio; - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; - - if (s->cache_miss || s->op.skip) + if (s->cache_miss || s->iop.bypass) { + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; - - if (miss != bio || - (bio->bi_rw & REQ_RAHEAD) || - (bio->bi_rw & REQ_META) || - s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) - reada = 0; - else { - reada = min(dc->readahead >> 9, - sectors - bio_sectors(miss)); - - if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - - bio_end_sector(miss); } - s->cache_bio_sectors = bio_sectors(miss) + reada; - s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, - DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), - dc->disk.bio_split); + if (!(bio->bi_rw & REQ_RAHEAD) && + !(bio->bi_rw & REQ_META) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + reada = min_t(sector_t, dc->readahead >> 9, + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - if (!s->op.cache_bio) - goto out_submit; + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->op.cache_bio->bi_sector = miss->bi_sector; - s->op.cache_bio->bi_bdev = miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); - s->op.cache_bio->bi_end_io = request_endio; - s->op.cache_bio->bi_private = &s->cl; + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); + if (ret) + return ret; + + s->iop.replace = true; + + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; - if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) - goto out_put; + ret = miss == bio ? MAP_DONE : -EINTR; + + cache_bio = bio_alloc_bioset(GFP_NOWAIT, + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), + dc->disk.bio_split); + if (!cache_bio) + goto out_submit; + + cache_bio->bi_sector = miss->bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_size = s->insert_bio_sectors << 9; + + cache_bio->bi_end_io = request_endio; + cache_bio->bi_private = &s->cl; - bch_bio_map(s->op.cache_bio, NULL); - if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + bch_bio_map(cache_bio, NULL); + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - s->cache_miss = miss; - bio_get(s->op.cache_bio); + if (reada) + bch_mark_cache_readahead(s->iop.c, s->d); - closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); + closure_bio_submit(cache_bio, &s->cl, s->d); return ret; out_put: - bio_put(s->op.cache_bio); - s->op.cache_bio = NULL; + bio_put(cache_bio); out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; closure_bio_submit(miss, &s->cl, s->d); return ret; } -static void request_read(struct cached_dev *dc, struct search *s) +static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - check_should_skip(dc, s); - closure_call(&s->op.cl, btree_read_async, NULL, cl); - - continue_at(cl, request_read_done_bh, NULL); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + continue_at(cl, cached_dev_read_done_bh, NULL); } /* Process writes */ @@ -956,47 +1028,52 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_write(struct cached_dev *dc, struct search *s) +static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start, end; - start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end_sector(bio), 0); + struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); - check_should_skip(dc, s); down_read_non_owner(&dc->writeback_lock); - if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { - s->op.skip = false; - s->writeback = true; + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->iop.bypass = false; + s->iop.writeback = true; } + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. + */ if (bio->bi_rw & REQ_DISCARD) - goto skip; + s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.skip)) { - s->op.skip = false; - s->writeback = true; + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; } - if (s->op.skip) - goto skip; - - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); - if (!s->writeback) { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); - - closure_bio_submit(bio, cl, s->d); - } else { + if (!(bio->bi_rw & REQ_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) + closure_bio_submit(bio, cl, s->d); + } else if (s->iop.writeback) { bch_writeback_add(dc); - s->op.cache_bio = bio; + s->iop.bio = bio; if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ @@ -1010,36 +1087,26 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } - } -out: - closure_call(&s->op.cl, bch_insert_data, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); -skip: - s->op.skip = true; - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); + } else { + s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); - if ((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - goto out; + closure_bio_submit(bio, cl, s->d); + } - closure_bio_submit(bio, cl, s->d); - goto out; + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); } -static void request_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct closure *cl) { - struct closure *cl = &s->cl; + struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (bio->bi_rw & REQ_DISCARD) { - request_write(dc, s); - return; - } - - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); continue_at(cl, cached_dev_bio_complete, NULL); @@ -1047,134 +1114,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -unsigned bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static void check_should_skip(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->op.c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return; -skip: - bch_mark_sectors_bypassed(s, bio_sectors(bio)); - s->op.skip = true; -} - static void cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1192,14 +1131,24 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) if (cached_dev_get(dc)) { s = search_alloc(bio, d); - trace_bcache_request_start(s, bio); - - if (!bio_has_data(bio)) - request_nodata(dc, s); - else if (rw) - request_write(dc, s); - else - request_read(dc, s); + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + } else { + s->iop.bypass = check_should_bypass(dc, bio); + + if (rw) + cached_dev_write(dc, s); + else + cached_dev_read(dc, s); + } } else { if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1274,9 +1223,19 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, bio_advance(bio, min(sectors << 9, bio->bi_size)); if (!bio->bi_size) - s->op.lookup_done = true; + return MAP_DONE; - return 0; + return MAP_CONTINUE; +} + +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + + continue_at(cl, search_free, NULL); } static void flash_dev_make_request(struct request_queue *q, struct bio *bio) @@ -1295,23 +1254,28 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) cl = &s->cl; bio = &s->bio.bio; - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); - if (bio_has_data(bio) && !rw) { - closure_call(&s->op.cl, btree_read_async, NULL, cl); - } else if (bio_has_data(bio) || s->op.skip) { - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, + if (!bio->bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); + } else if (rw) { + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->writeback = true; - s->op.cache_bio = bio; + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } else { - /* No data - probably a cache flush */ - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 57dc4784f4f4..2cd65bf073c2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -3,40 +3,33 @@ #include <linux/cgroup.h> -struct search { - /* Stack frame for bio_complete */ +struct data_insert_op { struct closure cl; + struct cache_set *c; + struct bio *bio; - struct bcache_device *d; - struct task_struct *task; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - unsigned cache_bio_sectors; - - unsigned recoverable:1; - unsigned unaligned_bvec:1; + unsigned inode; + uint16_t write_point; + uint16_t write_prio; + short error; - unsigned write:1; + unsigned bypass:1; unsigned writeback:1; + unsigned flush_journal:1; + unsigned csum:1; - /* IO error returned to s->bio */ - short error; - unsigned long start_time; + unsigned replace:1; + unsigned replace_collision:1; + + unsigned insert_data_done:1; - /* Anything past op->keys won't get zeroed in do_bio_hook */ - struct btree_op op; + /* Anything past this point won't get zeroed in search_alloc() */ + struct keylist insert_keys; + BKEY_PADDED(replace_key); }; -void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); -void bch_insert_data(struct closure *cl); -void bch_btree_insert_async(struct closure *); -void bch_cache_read_endio(struct bio *, int); - -void bch_open_buckets_free(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); +void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index b8730e714d69..84d0782f702e 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "stats.h" #include "btree.h" -#include "request.h" #include "sysfs.h" /* @@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats, atomic_inc(&stats->cache_bypass_misses); } -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); + mark_cache_stats(&c->accounting.collector, hit, bypass); #ifdef CONFIG_CGROUP_BCACHE mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); #endif } -void bch_mark_cache_readahead(struct search *s) +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->op.c->accounting.collector.cache_readaheads); + atomic_inc(&c->accounting.collector.cache_readaheads); } -void bch_mark_cache_miss_collision(struct search *s) +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); } -void bch_mark_sectors_bypassed(struct search *s, int sectors) +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct cache_accounting *acc, diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index c7c7a8fd29fe..adbff141c887 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -38,7 +38,9 @@ struct cache_accounting { struct cache_stats day; }; -struct search; +struct cache_set; +struct cached_dev; +struct bcache_device; void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent); @@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); -void bch_mark_cache_readahead(struct search *s); -void bch_mark_cache_miss_collision(struct search *s); -void bch_mark_sectors_bypassed(struct search *s, int sectors); +void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, + bool, bool); +void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); +void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); +void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c57b052..dec15cd2d797 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include <linux/buffer_head.h> #include <linux/debugfs.h> #include <linux/genhd.h> +#include <linux/idr.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -45,21 +46,13 @@ const char * const bch_cache_modes[] = { NULL }; -struct uuid_entry_v0 { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - uint32_t pad; -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); -static int bcache_major, bcache_minor; +static int bcache_major; +static DEFINE_IDA(bcache_minor); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; @@ -382,7 +375,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (__bch_ptr_invalid(c, 1, k)) + if (bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -427,7 +420,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -435,7 +428,7 @@ static int __uuid_write(struct cache_set *c) closure_sync(&cl); bkey_copy(&c->uuid_bucket, &k.key); - __bkey_put(c, &k.key); + bkey_put(c, &k.key); return 0; } @@ -562,10 +555,10 @@ void bch_prio_write(struct cache *ca) } p->next_bucket = ca->prio_buckets[i + 1]; - p->magic = pset_magic(ca); + p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -613,7 +606,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); - if (p->magic != pset_magic(ca)) + if (p->magic != pset_magic(&ca->sb)) pr_warn("bad magic reading priorities"); bucket = p->next_bucket; @@ -630,7 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; - if (atomic_read(&d->closing)) + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; closure_get(&d->cl); @@ -659,20 +652,24 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { - if (!atomic_xchg(&d->closing, 1)) + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) closure_queue(&d->cl); } static void bcache_device_unlink(struct bcache_device *d) { - unsigned i; - struct cache *ca; + lockdep_assert_held(&bch_register_lock); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + unsigned i; + struct cache *ca; - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, @@ -696,19 +693,16 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&d->detaching)) { + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); u->invalidated = cpu_to_le32(get_seconds()); bch_uuid_write(d->c); - - atomic_set(&d->detaching, 0); } - if (!d->flush_done) - bcache_device_unlink(d); + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -739,14 +733,20 @@ static void bcache_device_free(struct bcache_device *d) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); - if (d->disk) + if (d->disk) { + ida_simple_remove(&bcache_minor, d->disk->first_minor); put_disk(d->disk); + } bio_split_pool_free(&d->bio_split_hook); if (d->unaligned_bvec) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); if (is_vmalloc_addr(d->stripe_sectors_dirty)) vfree(d->stripe_sectors_dirty); else @@ -760,15 +760,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; + int minor; - if (!d->stripe_size_bits) - d->stripe_size_bits = 31; + if (!d->stripe_size) + d->stripe_size = 1 << 31; - d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> - d->stripe_size_bits; + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); return -ENOMEM; + } n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = n < PAGE_SIZE << 6 @@ -777,22 +781,38 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->stripe_sectors_dirty) return -ENOMEM; + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) + return minor; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || bio_split_pool_init(&d->bio_split_hook) || - !(d->disk = alloc_disk(1)) || - !(q = blk_alloc_queue(GFP_KERNEL))) + !(d->disk = alloc_disk(1))) { + ida_simple_remove(&bcache_minor, minor); return -ENOMEM; + } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); d->disk->major = bcache_major; - d->disk->first_minor = bcache_minor++; + d->disk->first_minor = minor; d->disk->fops = &bcache_ops; d->disk->private_data = d; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; @@ -874,7 +894,7 @@ static void cached_dev_detach_finish(struct work_struct *w) struct closure cl; closure_init_stack(&cl); - BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -888,6 +908,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + mutex_unlock(&bch_register_lock); pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); @@ -900,10 +922,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&dc->disk.closing)) + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; - if (atomic_xchg(&dc->disk.detaching, 1)) + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) return; /* @@ -1030,6 +1052,7 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -1058,11 +1081,7 @@ static void cached_dev_flush(struct closure *cl) struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); - d->flush_done = 1; - - if (d->c) - bcache_device_unlink(d); - + bcache_device_unlink(d); mutex_unlock(&bch_register_lock); bch_cache_accounting_destroy(&dc->accounting); @@ -1088,7 +1107,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; for (io = dc->io; io < dc->io + RECENT_IO; io++) { @@ -1260,7 +1278,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) { va_list args; - if (test_bit(CACHE_SET_STOPPING, &c->flags)) + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) return false; /* XXX: we can be called from atomic context @@ -1275,6 +1294,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) printk(", disabling caching\n"); + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + bch_cache_set_unregister(c); return true; } @@ -1339,6 +1361,9 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); + if (c->gc_thread) + kthread_stop(c->gc_thread); + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); @@ -1433,12 +1458,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); - spin_lock_init(&c->btree_read_time_lock); + mutex_init(&c->sort_lock); + + spin_lock_init(&c->sort_time.lock); + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + spin_lock_init(&c->try_harder_time.lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1483,11 +1515,10 @@ static void run_cache_set(struct cache_set *c) const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; + struct closure cl; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + closure_init_stack(&cl); for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; @@ -1498,7 +1529,7 @@ static void run_cache_set(struct cache_set *c) struct jset *j; err = "cannot allocate memory for journal"; - if (bch_journal_read(c, &journal, &op)) + if (bch_journal_read(c, &journal)) goto err; pr_debug("btree_journal_read() done"); @@ -1522,23 +1553,23 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (__bch_ptr_invalid(c, j->btree_level + 1, k)) + if (bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; list_del_init(&c->root->list); rw_unlock(true, c->root); - err = uuid_read(c, j, &op.cl); + err = uuid_read(c, j, &cl); if (err) goto err; err = "error in recovery"; - if (bch_btree_check(c, &op)) + if (bch_btree_check(c)) goto err; bch_journal_mark(c, &journal); @@ -1570,11 +1601,9 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal, &op); + bch_journal_replay(c, &journal); } else { pr_notice("invalidating existing data"); - /* Don't want invalidate_buckets() to queue a gc yet */ - closure_lock(&c->gc, NULL); for_each_cache(ca, c, i) { unsigned j; @@ -1600,15 +1629,15 @@ static void run_cache_set(struct cache_set *c) err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) - goto err_unlock_gc; + goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, &op.cl); + c->root = bch_btree_node_alloc(c, 0, true); if (IS_ERR_OR_NULL(c->root)) - goto err_unlock_gc; + goto err; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_node_write(c->root, &op.cl); + bch_btree_node_write(c->root, &cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1621,14 +1650,14 @@ static void run_cache_set(struct cache_set *c) SET_CACHE_SYNC(&c->sb, true); bch_journal_next(&c->journal); - bch_journal_meta(c, &op.cl); - - /* Unlock */ - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); + bch_journal_meta(c, &cl); } - closure_sync(&op.cl); + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + + closure_sync(&cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1638,13 +1667,10 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); return; -err_unlock_gc: - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); err: - closure_sync(&op.cl); + closure_sync(&cl); /* XXX: test this, it's broken */ - bch_cache_set_error(c, err); + bch_cache_set_error(c, "%s", err); } static bool can_attach_cache(struct cache *ca, struct cache_set *c) @@ -1725,8 +1751,6 @@ void bch_cache_release(struct kobject *kobj) if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; - bch_cache_allocator_exit(ca); - bio_split_pool_free(&ca->bio_split_hook); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); @@ -1758,8 +1782,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; @@ -2006,7 +2028,6 @@ static struct notifier_block reboot = { static void bcache_exit(void) { bch_debug_exit(); - bch_writeback_exit(); bch_request_exit(); bch_btree_exit(); if (bcache_kobj) @@ -2039,7 +2060,6 @@ static int __init bcache_init(void) sysfs_create_files(bcache_kobj, files) || bch_btree_init() || bch_request_init() || - bch_writeback_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 924dcfdae111..80d4c2bee18a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { NULL }; +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -66,7 +72,6 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); -rw_attribute(sequential_merge); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(writeback_metadata); @@ -90,11 +95,14 @@ rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(readahead); +rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); +rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); @@ -116,6 +124,7 @@ SHOW(__bch_cached_dev) sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); @@ -150,10 +159,9 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); - var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -185,6 +193,7 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); + d_strtoul(bypass_torture_test); d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); @@ -199,7 +208,6 @@ STORE(__cached_dev) dc->writeback_rate_p_term_inverse, 1, INT_MAX); d_strtoul(writeback_rate_d_smooth); - d_strtoul(sequential_merge); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -311,7 +319,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, - &sysfs_sequential_merge, &sysfs_clear_stats, &sysfs_running, &sysfs_state, @@ -319,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, + &sysfs_bypass_torture_test, #endif NULL }; @@ -366,7 +374,7 @@ STORE(__bch_flash_dev) } if (attr == &sysfs_unregister) { - atomic_set(&d->detaching, 1); + set_bit(BCACHE_DEV_DETACHING, &d->flags); bcache_device_stop(d); } @@ -481,7 +489,6 @@ lock_root: sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(dirty_data, c->gc_stats.dirty); sysfs_hprint(average_key_size, average_key_size(c)); sysfs_print(cache_read_races, @@ -492,6 +499,10 @@ lock_root: sysfs_print(writeback_keys_failed, atomic_long_read(&c->writeback_keys_failed)); + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); @@ -506,6 +517,8 @@ lock_root: sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -555,7 +568,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_trigger_gc) - bch_queue_gc(c); + wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -569,6 +582,15 @@ STORE(__bch_cache_set) sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + if (attr == &sysfs_io_error_limit) c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; @@ -579,6 +601,7 @@ STORE(__bch_cache_set) sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); @@ -618,8 +641,8 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_cache_available_percent, &sysfs_average_key_size, - &sysfs_dirty_data, + &sysfs_errors, &sysfs_io_error_limit, &sysfs_io_error_halflife, &sysfs_congested, @@ -653,6 +676,7 @@ static struct attribute *bch_cache_set_internal_files[] = { #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, #endif &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index f7b6c197f90f..adbc3df17a80 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,6 +1,5 @@ #include "bcache.h" #include "btree.h" -#include "request.h" #include <linux/blktrace_api.h> #include <linux/module.h> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 420dad545c7d..462214eeacbe 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) { - uint64_t now = local_clock(); - uint64_t duration = time_after64(now, start_time) + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) ? now - start_time : 0; - uint64_t last = time_after64(now, stats->last) + last = time_after64(now, stats->last) ? now - stats->last : 0; stats->max_duration = max(stats->max_duration, duration); @@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } stats->last = now ?: 1; + + spin_unlock(&stats->lock); } /** diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6896f4..362c4b3f8b4a 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,28 +15,18 @@ struct closure; -#ifdef CONFIG_BCACHE_EDEBUG +#ifdef CONFIG_BCACHE_DEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#else /* EDEBUG */ +#else /* DEBUG */ #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) #endif -#define BITMASK(name, type, field, offset, size) \ -static inline uint64_t name(const type *k) \ -{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ - \ -static inline void SET_##name(type *k, uint64_t v) \ -{ \ - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ - k->field |= v << offset; \ -} - #define DECLARE_HEAP(type, name) \ struct { \ size_t size, used; \ @@ -388,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ ssize_t bch_read_string_list(const char *buf, const char * const list[]); struct time_stats { + spinlock_t lock; /* * all fields are in nanoseconds, averages are ewmas stored left shifted * by 8 diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ba3ee48320f2..99053b1251be 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -11,18 +11,11 @@ #include "debug.h" #include "writeback.h" +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> #include <trace/events/bcache.h> -static struct workqueue_struct *dirty_wq; - -static void read_dirty(struct closure *); - -struct dirty_io { - struct closure cl; - struct cached_dev *dc; - struct bio bio; -}; - /* Rate limiting */ static void __update_writeback_rate(struct cached_dev *dc) @@ -72,9 +65,6 @@ out: dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); } static void update_writeback_rate(struct work_struct *work) @@ -90,13 +80,16 @@ static void update_writeback_rate(struct work_struct *work) __update_writeback_rate(dc); up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { uint64_t ret; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; @@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) return min_t(uint64_t, ret, HZ); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} - -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) -{ - uint64_t stripe; - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); - unsigned stripe_size = 1 << dc->disk.stripe_size_bits; - - if (!KEY_DIRTY(k)) - return false; - - stripe = KEY_START(k) >> dc->disk.stripe_size_bits; - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != - stripe_size) - return false; - - if (nr_sectors <= stripe_size) - return true; - - nr_sectors -= stripe_size; - stripe++; - } -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { @@ -153,131 +120,6 @@ static void dirty_init(struct keybuf_key *w) bch_bio_map(bio, NULL); } -static void refill_dirty(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); - - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); - - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - - up_write(&dc->writeback_lock); - closure_return(cl); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - 1 << dc->disk.stripe_size_bits) - goto full_stripes; - - goto normal_refill; -full_stripes: - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ - - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - } - - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - } - - up_write(&dc->writeback_lock); - - bch_ratelimit_reset(&dc->writeback_rate); - - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); -} - -void bch_writeback_queue(struct cached_dev *dc) -{ - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } -} - -void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -} - -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, - uint64_t offset, int nr_sectors) -{ - struct bcache_device *d = c->devices[inode]; - unsigned stripe_size, stripe_offset; - uint64_t stripe; - - if (!d) - return; - - stripe_size = 1 << d->stripe_size_bits; - stripe = offset >> d->stripe_size_bits; - stripe_offset = offset & (stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - atomic_add(s, d->stripe_sectors_dirty + stripe); - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -/* Background writeback - IO loop */ - static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -297,26 +139,25 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { + int ret; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct keylist keys; - op.type = BTREE_REPLACE; - bkey_copy(&op.replace, &w->key); + bch_keylist_init(&keys); - SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&op.keys, &w->key); + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c); - closure_sync(&op.cl); + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); - if (op.insert_collision) + if (ret) trace_bcache_writeback_collision(&w->key); - atomic_long_inc(op.insert_collision + atomic_long_inc(ret ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); } @@ -374,30 +215,33 @@ static void read_dirty_submit(struct closure *cl) continue_at(cl, write_dirty, system_wq); } -static void read_dirty(struct closure *cl) +static void read_dirty(struct cached_dev *dc) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); + unsigned delay = 0; struct keybuf_key *w; struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); /* * XXX: if we error, background writeback just spins. Should use some * mempools. */ - while (1) { + while (!kthread_should_stop()) { + try_to_freeze(); + w = bch_keybuf_next(&dc->writeback_keys); if (!w) break; BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) - delay = schedule_timeout_uninterruptible(delay); + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -423,7 +267,7 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, cl); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); } @@ -439,52 +283,205 @@ err: * Wait for outstanding writeback IOs to finish (and keybuf slots to be * freed) before refilling again */ - continue_at(cl, refill_dirty, dirty_wq); + closure_sync(&cl); } -/* Init */ +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset, stripe, sectors_dirty; + + if (!d) + return; + + stripe = offset_to_stripe(d, offset); + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} -static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, - struct cached_dev *dc) +static bool dirty_pred(struct keybuf *buf, struct bkey *k) { - struct bkey *k; - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) - if (!b->level) { - if (KEY_INODE(k) > dc->disk.id) - break; - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, dc->disk.id, - KEY_START(k), - KEY_SIZE(k)); - } else { - btree(sectors_dirty_init, k, b, op, dc); - if (KEY_INODE(k) > dc->disk.id) - break; - - cond_resched(); + return KEY_DIRTY(k); +} + +static void refill_full_stripes(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); + + if (stripe >= dc->disk.nr_stripes) + stripe = 0; + + start_stripe = stripe; + + while (1) { + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + if (stripe == dc->disk.nr_stripes) + goto next; + + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; } + } +} + +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } + + up_write(&dc->writeback_lock); + + bch_ratelimit_reset(&dc->writeback_rate); + read_dirty(dc); + + if (searched_full_index) { + unsigned delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_interruptible(delay); + } + } return 0; } +/* Init */ + +struct sectors_dirty_init { + struct btree_op op; + unsigned inode; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + struct bkey *k) +{ + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); + if (KEY_INODE(k) > op->inode) + return MAP_DONE; + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; +} + void bch_sectors_dirty_init(struct cached_dev *dc) { - struct btree_op op; + struct sectors_dirty_init op; + + bch_btree_op_init(&op.op, -1); + op.inode = dc->disk.id; - bch_btree_op_init_stack(&op); - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); } -void bch_cached_dev_writeback_init(struct cached_dev *dc) +int bch_cached_dev_writeback_init(struct cached_dev *dc) { sema_init(&dc->in_flight, 64); - closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; @@ -498,22 +495,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_p_term_inverse = 64; dc->writeback_rate_d_smooth = 8; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) + return PTR_ERR(dc->writeback_thread); + + set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -} - -void bch_writeback_exit(void) -{ - if (dirty_wq) - destroy_workqueue(dirty_wq); -} - -int __init bch_writeback_init(void) -{ - dirty_wq = create_workqueue("bcache_writeback"); - if (!dirty_wq) - return -ENOMEM; return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c91f61bb95b6..c9ddcf4614b9 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,20 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset >> d->stripe_size_bits; + unsigned stripe = offset_to_stripe(&dc->disk, offset); while (1) { - if (atomic_read(d->stripe_sectors_dirty + stripe)) + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= 1 << d->stripe_size_bits) + if (nr_sectors <= dc->disk.stripe_size) return false; - nr_sectors -= 1 << d->stripe_size_bits; + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -38,12 +45,12 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || - atomic_read(&dc->disk.detaching) || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || in_use > CUTOFF_WRITEBACK_SYNC) return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_sector, bio_sectors(bio))) return true; @@ -54,11 +61,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use <= CUTOFF_WRITEBACK; } +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *); void bch_sectors_dirty_init(struct cached_dev *dc); -void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_init(struct cached_dev *); #endif diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 451bf99582ff..846d5c6609d8 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2978,12 +2978,12 @@ static int dasd_alloc_queue(struct dasd_block *block) elevator_exit(block->request_queue->elevator); block->request_queue->elevator = NULL; + mutex_lock(&block->request_queue->sysfs_lock); rc = elevator_init(block->request_queue, "deadline"); - if (rc) { + if (rc) blk_cleanup_queue(block->request_queue); - return rc; - } - return 0; + mutex_unlock(&block->request_queue->sysfs_lock); + return rc; } /* diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 5ebda976ea93..e2b9576d00e2 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -6,11 +6,9 @@ #include <linux/tracepoint.h> -struct search; - DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio), TP_STRUCT__entry( __field(dev_t, dev ) @@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_major = s->d->disk->major; - __entry->orig_minor = s->d->disk->first_minor; + __entry->orig_major = d->disk->major; + __entry->orig_minor = d->disk->first_minor; __entry->sector = bio->bi_sector; __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; @@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node, /* request.c */ DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DECLARE_EVENT_CLASS(bcache_bio, @@ -370,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root, TP_ARGS(b) ); +TRACE_EVENT(bcache_keyscan, + TP_PROTO(unsigned nr_found, + unsigned start_inode, uint64_t start_offset, + unsigned end_inode, uint64_t end_offset), + TP_ARGS(nr_found, + start_inode, start_offset, + end_inode, end_offset), + + TP_STRUCT__entry( + __field(__u32, nr_found ) + __field(__u32, start_inode ) + __field(__u64, start_offset ) + __field(__u32, end_inode ) + __field(__u64, end_offset ) + ), + + TP_fast_assign( + __entry->nr_found = nr_found; + __entry->start_inode = start_inode; + __entry->start_offset = start_offset; + __entry->end_inode = end_inode; + __entry->end_offset = end_offset; + ), + + TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, + __entry->start_inode, __entry->start_offset, + __entry->end_inode, __entry->end_offset) +); + /* Allocator */ TRACE_EVENT(bcache_alloc_invalidate, diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h new file mode 100644 index 000000000000..164a7e263988 --- /dev/null +++ b/include/uapi/linux/bcache.h @@ -0,0 +1,373 @@ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include <asm/types.h> + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + __u64 csum; + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + __u64 pad[8]; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 bucket_size; /* sectors */ + + __u16 nr_in_set; + __u16 nr_this_dev; + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time_t */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ |