diff options
Diffstat (limited to 'drivers/hv/vmbus_drv.c')
-rw-r--r-- | drivers/hv/vmbus_drv.c | 308 |
1 files changed, 293 insertions, 15 deletions
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ebd35fc35290..029378c27421 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -24,12 +24,14 @@ #include <linux/sched/task_stack.h> #include <asm/mshyperv.h> +#include <linux/delay.h> #include <linux/notifier.h> #include <linux/ptrace.h> #include <linux/screen_info.h> #include <linux/kdebug.h> #include <linux/efi.h> #include <linux/random.h> +#include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include "hyperv_vmbus.h" @@ -77,7 +79,7 @@ static struct notifier_block hyperv_panic_block = { static const char *fb_mmio_name = "fb_range"; static struct resource *fb_mmio; static struct resource *hyperv_mmio; -static DEFINE_SEMAPHORE(hyperv_mmio_lock); +static DEFINE_MUTEX(hyperv_mmio_lock); static int vmbus_exists(void) { @@ -910,6 +912,45 @@ static void vmbus_shutdown(struct device *child_device) drv->shutdown(dev); } +#ifdef CONFIG_PM_SLEEP +/* + * vmbus_suspend - Suspend a vmbus device + */ +static int vmbus_suspend(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->suspend) + return -EOPNOTSUPP; + + return drv->suspend(dev); +} + +/* + * vmbus_resume - Resume a vmbus device + */ +static int vmbus_resume(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->resume) + return -EOPNOTSUPP; + + return drv->resume(dev); +} +#endif /* CONFIG_PM_SLEEP */ /* * vmbus_device_release - Final callback release of the vmbus child device @@ -919,12 +960,22 @@ static void vmbus_device_release(struct device *device) struct hv_device *hv_dev = device_to_hv_device(device); struct vmbus_channel *channel = hv_dev->channel; + hv_debug_rm_dev_dir(hv_dev); + mutex_lock(&vmbus_connection.channel_mutex); hv_process_channel_removal(channel); mutex_unlock(&vmbus_connection.channel_mutex); kfree(hv_dev); } +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. + */ +static const struct dev_pm_ops vmbus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) +}; + /* The one and only one */ static struct bus_type hv_bus = { .name = "vmbus", @@ -935,6 +986,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .pm = &vmbus_pm, }; struct onmessage_work_context { @@ -981,6 +1033,10 @@ void vmbus_on_msg_dpc(unsigned long data) } entry = &channel_message_table[hdr->msgtype]; + + if (!entry->message_handler) + goto msg_handled; + if (entry->handler_type == VMHT_BLOCKING) { ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx == NULL) @@ -1022,6 +1078,43 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +#ifdef CONFIG_PM_SLEEP +/* + * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for + * hibernation, because hv_sock connections can not persist across hibernation. + */ +static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) +{ + struct onmessage_work_context *ctx; + struct vmbus_channel_rescind_offer *rescind; + + WARN_ON(!is_hvsock_channel(channel)); + + /* + * sizeof(*ctx) is small and the allocation should really not fail, + * otherwise the state of the hv_sock connections ends up in limbo. + */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + + /* + * So far, these are not really used by Linux. Just set them to the + * reasonable values conforming to the definitions of the fields. + */ + ctx->msg.header.message_type = 1; + ctx->msg.header.payload_size = sizeof(*rescind); + + /* These values are actually used by Linux. */ + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; + rescind->child_relid = channel->offermsg.child_relid; + + INIT_WORK(&ctx->work, vmbus_onmessage_work); + + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &ctx->work); +} +#endif /* CONFIG_PM_SLEEP */ /* * Direct callback for channels using other deferred processing @@ -1186,7 +1279,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, * Write dump contents to the page. No need to synchronize; panic should * be single-threaded. */ - kmsg_dump_get_buffer(dumper, true, hv_panic_page, PAGE_SIZE, + kmsg_dump_get_buffer(dumper, true, hv_panic_page, HV_HYP_PAGE_SIZE, &bytes_written); if (bytes_written) hyperv_report_panic_msg(panic_pa, bytes_written); @@ -1253,10 +1346,6 @@ static int vmbus_bus_init(void) if (ret) goto err_alloc; - ret = hv_stimer_alloc(VMBUS_MESSAGE_SINT); - if (ret < 0) - goto err_alloc; - /* * Initialize the per-cpu interrupt state and stimer state. * Then connect to the host. @@ -1290,7 +1379,7 @@ static int vmbus_bus_init(void) */ hv_get_crash_ctl(hyperv_crash_ctl); if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) { - hv_panic_page = (void *)get_zeroed_page(GFP_KERNEL); + hv_panic_page = (void *)hv_alloc_hyperv_zeroed_page(); if (hv_panic_page) { ret = kmsg_dump_register(&hv_kmsg_dumper); if (ret) @@ -1313,13 +1402,12 @@ static int vmbus_bus_init(void) err_connect: cpuhp_remove_state(hyperv_cpuhp_online); err_cpuhp: - hv_stimer_free(); -err_alloc: hv_synic_free(); +err_alloc: hv_remove_vmbus_irq(); bus_unregister(&hv_bus); - free_page((unsigned long)hv_panic_page); + hv_free_hyperv_page((unsigned long)hv_panic_page); unregister_sysctl_table(hv_ctl_table_hdr); hv_ctl_table_hdr = NULL; return ret; @@ -1727,6 +1815,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) pr_err("Unable to register primary channeln"); goto err_kset_unregister; } + hv_debug_add_dev_dir(child_device_obj); return 0; @@ -1928,7 +2017,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, int retval; retval = -ENXIO; - down(&hyperv_mmio_lock); + mutex_lock(&hyperv_mmio_lock); /* * If overlaps with frame buffers are allowed, then first attempt to @@ -1975,7 +2064,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, } exit: - up(&hyperv_mmio_lock); + mutex_unlock(&hyperv_mmio_lock); return retval; } EXPORT_SYMBOL_GPL(vmbus_allocate_mmio); @@ -1992,7 +2081,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) { struct resource *iter; - down(&hyperv_mmio_lock); + mutex_lock(&hyperv_mmio_lock); for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; @@ -2000,7 +2089,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) __release_region(iter, start, size); } release_mem_region(start, size); - up(&hyperv_mmio_lock); + mutex_unlock(&hyperv_mmio_lock); } EXPORT_SYMBOL_GPL(vmbus_free_mmio); @@ -2042,6 +2131,130 @@ acpi_walk_err: return ret_val; } +#ifdef CONFIG_PM_SLEEP +static int vmbus_bus_suspend(struct device *dev) +{ + struct vmbus_channel *channel, *sc; + unsigned long flags; + + while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { + /* + * We wait here until the completion of any channel + * offers that are currently in progress. + */ + msleep(1); + } + + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!is_hvsock_channel(channel)) + continue; + + vmbus_force_channel_rescinded(channel); + } + mutex_unlock(&vmbus_connection.channel_mutex); + + /* + * Wait until all the sub-channels and hv_sock channels have been + * cleaned up. Sub-channels should be destroyed upon suspend, otherwise + * they would conflict with the new sub-channels that will be created + * in the resume path. hv_sock channels should also be destroyed, but + * a hv_sock channel of an established hv_sock connection can not be + * really destroyed since it may still be referenced by the userspace + * application, so we just force the hv_sock channel to be rescinded + * by vmbus_force_channel_rescinded(), and the userspace application + * will thoroughly destroy the channel after hibernation. + * + * Note: the counter nr_chan_close_on_suspend may never go above 0 if + * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM. + */ + if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) + wait_for_completion(&vmbus_connection.ready_for_suspend_event); + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + /* + * Invalidate the field. Upon resume, vmbus_onoffer() will fix + * up the field, and the other fields (if necessary). + */ + channel->offermsg.child_relid = INVALID_RELID; + + if (is_hvsock_channel(channel)) { + if (!channel->rescind) { + pr_err("hv_sock channel not rescinded!\n"); + WARN_ON_ONCE(1); + } + continue; + } + + spin_lock_irqsave(&channel->lock, flags); + list_for_each_entry(sc, &channel->sc_list, sc_list) { + pr_err("Sub-channel not deleted!\n"); + WARN_ON_ONCE(1); + } + spin_unlock_irqrestore(&channel->lock, flags); + + atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + vmbus_initiate_unload(false); + + vmbus_connection.conn_state = DISCONNECTED; + + /* Reset the event for the next resume. */ + reinit_completion(&vmbus_connection.ready_for_resume_event); + + return 0; +} + +static int vmbus_bus_resume(struct device *dev) +{ + struct vmbus_channel_msginfo *msginfo; + size_t msgsize; + int ret; + + /* + * We only use the 'vmbus_proto_version', which was in use before + * hibernation, to re-negotiate with the host. + */ + if (!vmbus_proto_version) { + pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version); + return -EINVAL; + } + + msgsize = sizeof(*msginfo) + + sizeof(struct vmbus_channel_initiate_contact); + + msginfo = kzalloc(msgsize, GFP_KERNEL); + + if (msginfo == NULL) + return -ENOMEM; + + ret = vmbus_negotiate_version(msginfo, vmbus_proto_version); + + kfree(msginfo); + + if (ret != 0) + return ret; + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); + + vmbus_request_offers(); + + wait_for_completion(&vmbus_connection.ready_for_resume_event); + + /* Reset the event for the next suspend. */ + reinit_completion(&vmbus_connection.ready_for_suspend_event); + + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + static const struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, @@ -2049,6 +2262,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the + * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the + * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> + * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's + * resume callback must also run via the "noirq" callbacks. + */ +static const struct dev_pm_ops vmbus_bus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) +}; + static struct acpi_driver vmbus_acpi_driver = { .name = "vmbus", .ids = vmbus_acpi_device_ids, @@ -2056,6 +2282,7 @@ static struct acpi_driver vmbus_acpi_driver = { .add = vmbus_acpi_add, .remove = vmbus_acpi_remove, }, + .drv.pm = &vmbus_bus_pm, }; static void hv_kexec_handler(void) @@ -2082,10 +2309,54 @@ static void hv_crash_handler(struct pt_regs *regs) vmbus_connection.conn_state = DISCONNECTED; cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_cleanup(cpu); + hv_synic_disable_regs(cpu); hyperv_cleanup(); }; +static int hv_synic_suspend(void) +{ + /* + * When we reach here, all the non-boot CPUs have been offlined. + * If we're in a legacy configuration where stimer Direct Mode is + * not enabled, the stimers on the non-boot CPUs have been unbound + * in hv_synic_cleanup() -> hv_stimer_legacy_cleanup() -> + * hv_stimer_cleanup() -> clockevents_unbind_device(). + * + * hv_synic_suspend() only runs on CPU0 with interrupts disabled. + * Here we do not call hv_stimer_legacy_cleanup() on CPU0 because: + * 1) it's unnecessary as interrupts remain disabled between + * syscore_suspend() and syscore_resume(): see create_image() and + * resume_target_kernel() + * 2) the stimer on CPU0 is automatically disabled later by + * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ... + * -> clockevents_shutdown() -> ... -> hv_ce_shutdown() + * 3) a warning would be triggered if we call + * clockevents_unbind_device(), which may sleep, in an + * interrupts-disabled context. + */ + + hv_synic_disable_regs(0); + + return 0; +} + +static void hv_synic_resume(void) +{ + hv_synic_enable_regs(0); + + /* + * Note: we don't need to call hv_stimer_init(0), because the timer + * on CPU0 is not unbound in hv_synic_suspend(), and the timer is + * automatically re-enabled in timekeeping_resume(). + */ +} + +/* The callbacks run only on CPU0, with irqs_disabled. */ +static struct syscore_ops hv_synic_syscore_ops = { + .suspend = hv_synic_suspend, + .resume = hv_synic_resume, +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -2108,6 +2379,7 @@ static int __init hv_acpi_init(void) ret = -ETIMEDOUT; goto cleanup; } + hv_debug_init(); ret = vmbus_bus_init(); if (ret) @@ -2116,6 +2388,8 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); + register_syscore_ops(&hv_synic_syscore_ops); + return 0; cleanup: @@ -2128,6 +2402,8 @@ static void __exit vmbus_exit(void) { int cpu; + unregister_syscore_ops(&hv_synic_syscore_ops); + hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; @@ -2140,6 +2416,8 @@ static void __exit vmbus_exit(void) tasklet_kill(&hv_cpu->msg_dpc); } + hv_debug_rm_all_dir(); + vmbus_free_channels(); if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { |