Diffstat (limited to 'drivers/hv')
-rw-r--r--  drivers/hv/Kconfig             |   2
-rw-r--r--  drivers/hv/Makefile            |   1
-rw-r--r--  drivers/hv/channel_mgmt.c      |   8
-rw-r--r--  drivers/hv/hv.c                |  64
-rw-r--r--  drivers/hv/hv_balloon.c        | 121
-rw-r--r--  drivers/hv/hv_trace.c          |   2
-rw-r--r--  drivers/hv/hv_trace.h          |   2
-rw-r--r--  drivers/hv/hv_trace_balloon.h  |  48
-rw-r--r--  drivers/hv/hyperv_vmbus.h      |   4
-rw-r--r--  drivers/hv/ring_buffer.c       |  52
10 files changed, 237 insertions, 67 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 50b89ea0e60f..97954f575c3f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + menu "Microsoft Hyper-V guest support" config HYPERV diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 14c22786b519..a1eec7177c2d 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o CFLAGS_hv_trace.o = -I$(src) +CFLAGS_hv_balloon.o = -I$(src) hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index c21020b69114..ecc2bd275a73 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -71,7 +71,7 @@ static const struct vmbus_device vmbus_devs[] = { /* PCIE */ { .dev_type = HV_PCIE, HV_PCIE_GUID, - .perf_device = true, + .perf_device = false, }, /* Synthetic Frame Buffer */ @@ -596,10 +596,8 @@ static int next_numa_node_id; /* * Starting with Win8, we can statically distribute the incoming * channel interrupt load by binding a channel to VCPU. - * We do this in a hierarchical fashion: - * First distribute the primary channels across available NUMA nodes - * and then distribute the subchannels amongst the CPUs in the NUMA - * node assigned to the primary channel. + * We distribute the interrupt loads to one or more NUMA nodes based on + * the channel's affinity_policy. * * For pre-win8 hosts or non-performance critical channels we assign the * first CPU in the first NUMA node. diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 45f3694bbb76..9b82549cbbc8 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -27,7 +27,7 @@ #include <linux/vmalloc.h> #include <linux/hyperv.h> #include <linux/version.h> -#include <linux/interrupt.h> +#include <linux/random.h> #include <linux/clockchips.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -37,6 +37,17 @@ struct hv_context hv_context = { .synic_initialized = false, }; +/* + * If false, we're using the old mechanism for stimer0 interrupts + * where it sends a VMbus message when it expires. The old + * mechanism is used when running on older versions of Hyper-V + * that don't support Direct Mode. While Hyper-V provides + * four stimer's per CPU, Linux uses only stimer0. + */ +static bool direct_mode_enabled; +static int stimer0_irq; +static int stimer0_vector; + #define HV_TIMER_FREQUENCY (10 * 1000 * 1000) /* 100ns period */ #define HV_MAX_MAX_DELTA_TICKS 0xffffffff #define HV_MIN_DELTA_TICKS 1 @@ -52,6 +63,8 @@ int hv_init(void) if (!hv_context.cpu_context) return -ENOMEM; + direct_mode_enabled = ms_hyperv.misc_features & + HV_X64_STIMER_DIRECT_MODE_AVAILABLE; return 0; } @@ -90,6 +103,21 @@ int hv_post_message(union hv_connection_id connection_id, return status & 0xFFFF; } +/* + * ISR for when stimer0 is operating in Direct Mode. Direct Mode + * does not use VMbus or any VMbus messages, so process here and not + * in the VMbus driver code. 
+ */ + +static void hv_stimer0_isr(void) +{ + struct hv_per_cpu_context *hv_cpu; + + hv_cpu = this_cpu_ptr(hv_context.cpu_context); + hv_cpu->clk_evt->event_handler(hv_cpu->clk_evt); + add_interrupt_randomness(stimer0_vector, 0); +} + static int hv_ce_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -107,6 +135,8 @@ static int hv_ce_shutdown(struct clock_event_device *evt) { hv_init_timer(HV_X64_MSR_STIMER0_COUNT, 0); hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, 0); + if (direct_mode_enabled) + hv_disable_stimer0_percpu_irq(stimer0_irq); return 0; } @@ -115,11 +145,26 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt) { union hv_timer_config timer_cfg; + timer_cfg.as_uint64 = 0; timer_cfg.enable = 1; timer_cfg.auto_enable = 1; - timer_cfg.sintx = VMBUS_MESSAGE_SINT; + if (direct_mode_enabled) { + /* + * When it expires, the timer will directly interrupt + * on the specified hardware vector/IRQ. + */ + timer_cfg.direct_mode = 1; + timer_cfg.apic_vector = stimer0_vector; + hv_enable_stimer0_percpu_irq(stimer0_irq); + } else { + /* + * When it expires, the timer will generate a VMbus message, + * to be handled by the normal VMbus interrupt handler. + */ + timer_cfg.direct_mode = 0; + timer_cfg.sintx = VMBUS_MESSAGE_SINT; + } hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); - return 0; } @@ -146,7 +191,7 @@ int hv_synic_alloc(void) int cpu; hv_context.hv_numa_map = kzalloc(sizeof(struct cpumask) * nr_node_ids, - GFP_ATOMIC); + GFP_KERNEL); if (hv_context.hv_numa_map == NULL) { pr_err("Unable to allocate NUMA map\n"); goto err; @@ -190,6 +235,11 @@ int hv_synic_alloc(void) INIT_LIST_HEAD(&hv_cpu->chan_list); } + if (direct_mode_enabled && + hv_setup_stimer0_irq(&stimer0_irq, &stimer0_vector, + hv_stimer0_isr)) + goto err; + return 0; err: return -ENOMEM; @@ -216,7 +266,7 @@ void hv_synic_free(void) } /* - * hv_synic_init - Initialize the Synthethic Interrupt Controller. + * hv_synic_init - Initialize the Synthetic Interrupt Controller. * * If it is already initialized by another entity (ie x2v shim), we need to * retrieve the initialized message and event pages. Otherwise, we create and @@ -251,7 +301,6 @@ int hv_synic_init(unsigned int cpu) hv_get_synint_state(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - shared_sint.as_uint64 = 0; shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR; shared_sint.masked = false; if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) @@ -291,6 +340,9 @@ void hv_synic_clockevents_cleanup(void) if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)) return; + if (direct_mode_enabled) + hv_remove_stimer0_irq(stimer0_irq); + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index db0e6652d7ef..b3e9f13f8bc3 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -34,6 +34,9 @@ #include <linux/hyperv.h> +#define CREATE_TRACE_POINTS +#include "hv_trace_balloon.h" + /* * We begin with definitions supporting the Dynamic Memory protocol * with the host. @@ -576,11 +579,65 @@ static struct hv_dynmem_device dm_device; static void post_status(struct hv_dynmem_device *dm); #ifdef CONFIG_MEMORY_HOTPLUG +static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, + unsigned long pfn) +{ + struct hv_hotadd_gap *gap; + + /* The page is not backed. */ + if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + return false; + + /* Check for gaps. 
*/ + list_for_each_entry(gap, &has->gap_list, list) { + if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + return false; + } + + return true; +} + +static unsigned long hv_page_offline_check(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long pfn = start_pfn, count = 0; + struct hv_hotadd_state *has; + bool found; + + while (pfn < start_pfn + nr_pages) { + /* + * Search for HAS which covers the pfn and when we find one + * count how many consequitive PFNs are covered. + */ + found = false; + list_for_each_entry(has, &dm_device.ha_region_list, list) { + while ((pfn >= has->start_pfn) && + (pfn < has->end_pfn) && + (pfn < start_pfn + nr_pages)) { + found = true; + if (has_pfn_is_backed(has, pfn)) + count++; + pfn++; + } + } + + /* + * This PFN is not in any HAS (e.g. we're offlining a region + * which was present at boot), no need to account for it. Go + * to the next one. + */ + if (!found) + pfn++; + } + + return count; +} + static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mem = (struct memory_notify *)v; - unsigned long flags; + unsigned long flags, pfn_count; switch (val) { case MEM_ONLINE: @@ -593,7 +650,19 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, case MEM_OFFLINE: spin_lock_irqsave(&dm_device.ha_lock, flags); - dm_device.num_pages_onlined -= mem->nr_pages; + pfn_count = hv_page_offline_check(mem->start_pfn, + mem->nr_pages); + if (pfn_count <= dm_device.num_pages_onlined) { + dm_device.num_pages_onlined -= pfn_count; + } else { + /* + * We're offlining more pages than we managed to online. + * This is unexpected. In any case don't let + * num_pages_onlined wrap around zero. + */ + WARN_ON_ONCE(1); + dm_device.num_pages_onlined = 0; + } spin_unlock_irqrestore(&dm_device.ha_lock, flags); break; case MEM_GOING_ONLINE: @@ -612,30 +681,9 @@ static struct notifier_block hv_memory_nb = { /* Check if the particular page is backed and can be onlined and online it. */ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) { - unsigned long cur_start_pgp; - unsigned long cur_end_pgp; - struct hv_hotadd_gap *gap; - - cur_start_pgp = (unsigned long)pfn_to_page(has->covered_start_pfn); - cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); - - /* The page is not backed. */ - if (((unsigned long)pg < cur_start_pgp) || - ((unsigned long)pg >= cur_end_pgp)) + if (!has_pfn_is_backed(has, page_to_pfn(pg))) return; - /* Check for gaps. */ - list_for_each_entry(gap, &has->gap_list, list) { - cur_start_pgp = (unsigned long) - pfn_to_page(gap->start_pfn); - cur_end_pgp = (unsigned long) - pfn_to_page(gap->end_pfn); - if (((unsigned long)pg >= cur_start_pgp) && - ((unsigned long)pg < cur_end_pgp)) { - return; - } - } - /* This frame is currently backed; online the page. 
*/ __online_page_set_limits(pg); __online_page_increment_counters(pg); @@ -691,7 +739,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, (HA_CHUNK << PAGE_SHIFT)); if (ret) { - pr_warn("hot_add memory failed error is %d\n", ret); + pr_err("hot_add memory failed error is %d\n", ret); if (ret == -EEXIST) { /* * This error indicates that the error @@ -726,19 +774,13 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, static void hv_online_page(struct page *pg) { struct hv_hotadd_state *has; - unsigned long cur_start_pgp; - unsigned long cur_end_pgp; unsigned long flags; + unsigned long pfn = page_to_pfn(pg); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { - cur_start_pgp = (unsigned long) - pfn_to_page(has->start_pfn); - cur_end_pgp = (unsigned long)pfn_to_page(has->end_pfn); - /* The page belongs to a different HAS. */ - if (((unsigned long)pg < cur_start_pgp) || - ((unsigned long)pg >= cur_end_pgp)) + if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) continue; hv_page_online_one(has, pg); @@ -1014,7 +1056,7 @@ static void hot_add_req(struct work_struct *dummy) resp.result = 0; if (!do_hot_add || (resp.page_count == 0)) - pr_info("Memory hot add failed\n"); + pr_err("Memory hot add failed\n"); dm->state = DM_INITIALIZED; resp.hdr.trans_id = atomic_inc_return(&trans_id); @@ -1041,7 +1083,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) break; default: - pr_info("Received Unknown type: %d\n", info_hdr->type); + pr_warn("Received Unknown type: %d\n", info_hdr->type); } } @@ -1120,6 +1162,9 @@ static void post_status(struct hv_dynmem_device *dm) dm->num_pages_added - dm->num_pages_onlined : 0) + compute_balloon_floor(); + trace_balloon_status(status.num_avail, status.num_committed, + vm_memory_committed(), dm->num_pages_ballooned, + dm->num_pages_added, dm->num_pages_onlined); /* * If our transaction ID is no longer current, just don't * send the status. This can happen if we were interrupted @@ -1290,7 +1335,7 @@ static void balloon_up(struct work_struct *dummy) /* * Free up the memory we allocatted. 
*/ - pr_info("Balloon response failed\n"); + pr_err("Balloon response failed\n"); for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, @@ -1421,7 +1466,7 @@ static void cap_resp(struct hv_dynmem_device *dm, struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { - pr_info("Capabilities not accepted by host\n"); + pr_err("Capabilities not accepted by host\n"); dm->state = DM_INIT_ERROR; } complete(&dm->host_event); @@ -1508,7 +1553,7 @@ static void balloon_onchannelcallback(void *context) break; default: - pr_err("Unhandled message: type: %d\n", dm_hdr->type); + pr_warn("Unhandled message: type: %d\n", dm_hdr->type); } } diff --git a/drivers/hv/hv_trace.c b/drivers/hv/hv_trace.c index df47acd01a81..38d359cf1e70 100644 --- a/drivers/hv/hv_trace.c +++ b/drivers/hv/hv_trace.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #include "hyperv_vmbus.h" #define CREATE_TRACE_POINTS diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index d635ee95b20d..999f80a63bff 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #undef TRACE_SYSTEM #define TRACE_SYSTEM hyperv diff --git a/drivers/hv/hv_trace_balloon.h b/drivers/hv/hv_trace_balloon.h new file mode 100644 index 000000000000..93082888aec3 --- /dev/null +++ b/drivers/hv/hv_trace_balloon.h @@ -0,0 +1,48 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_HV_TRACE_BALLOON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _HV_TRACE_BALLOON_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(balloon_status, + TP_PROTO(u64 available, u64 committed, + unsigned long vm_memory_committed, + unsigned long pages_ballooned, + unsigned long pages_added, + unsigned long pages_onlined), + TP_ARGS(available, committed, vm_memory_committed, + pages_ballooned, pages_added, pages_onlined), + TP_STRUCT__entry( + __field(u64, available) + __field(u64, committed) + __field(unsigned long, vm_memory_committed) + __field(unsigned long, pages_ballooned) + __field(unsigned long, pages_added) + __field(unsigned long, pages_onlined) + ), + TP_fast_assign( + __entry->available = available; + __entry->committed = committed; + __entry->vm_memory_committed = vm_memory_committed; + __entry->pages_ballooned = pages_ballooned; + __entry->pages_added = pages_added; + __entry->pages_onlined = pages_onlined; + ), + TP_printk("available %lld, committed %lld; vm_memory_committed %ld;" + " pages_ballooned %ld, pages_added %ld, pages_onlined %ld", + __entry->available, __entry->committed, + __entry->vm_memory_committed, __entry->pages_ballooned, + __entry->pages_added, __entry->pages_onlined + ) + ); + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hv_trace_balloon +#endif /* _HV_TRACE_BALLOON_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 500f805a6ef2..f761bef36e77 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -58,7 +58,9 @@ union hv_timer_config { u64 periodic:1; u64 lazy:1; u64 auto_enable:1; - u64 reserved_z0:12; + u64 apic_vector:8; + u64 direct_mode:1; + u64 reserved_z0:3; u64 sintx:4; u64 reserved_z1:44; }; diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 50e071444a5c..8699bb969e7e 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -417,13 +417,24 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); +/* How many bytes were read in this iterator cycle */ +static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi, + u32 start_read_index) +{ + if (rbi->priv_read_index >= start_read_index) + return rbi->priv_read_index - start_read_index; + else + return rbi->ring_datasize - start_read_index + + rbi->priv_read_index; +} + /* * Update host ring buffer after iterating over packets. */ void hv_pkt_iter_close(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - u32 orig_write_sz = hv_get_bytes_to_write(rbi); + u32 curr_write_sz, pending_sz, bytes_read, start_read_index; /* * Make sure all reads are done before we update the read index since @@ -431,8 +442,12 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) * is updated. */ virt_rmb(); + start_read_index = rbi->ring_buffer->read_index; rbi->ring_buffer->read_index = rbi->priv_read_index; + if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz) + return; + /* * Issue a full memory barrier before making the signaling decision. * Here is the reason for having this barrier: @@ -446,26 +461,29 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) */ virt_mb(); - /* If host has disabled notifications then skip */ - if (rbi->ring_buffer->interrupt_mask) + pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); + if (!pending_sz) return; - if (rbi->ring_buffer->feature_bits.feat_pending_send_sz) { - u32 pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); + /* + * Ensure the read of write_index in hv_get_bytes_to_write() + * happens after the read of pending_send_sz. + */ + virt_rmb(); + curr_write_sz = hv_get_bytes_to_write(rbi); + bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index); - /* - * If there was space before we began iteration, - * then host was not blocked. Also handles case where - * pending_sz is zero then host has nothing pending - * and does not need to be signaled. - */ - if (orig_write_sz > pending_sz) - return; + /* + * If there was space before we began iteration, + * then host was not blocked. + */ - /* If pending write will not fit, don't give false hope. */ - if (hv_get_bytes_to_write(rbi) < pending_sz) - return; - } + if (curr_write_sz - bytes_read > pending_sz) + return; + + /* If pending write will not fit, don't give false hope. */ + if (curr_write_sz <= pending_sz) + return; vmbus_setevent(channel); } |
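Notes on the changes above.

The hv.c and hyperv_vmbus.h hunks introduce stimer0 Direct Mode: when the host advertises HV_X64_STIMER_DIRECT_MODE_AVAILABLE, the guest programs the synthetic timer to raise a hardware interrupt on a dedicated vector instead of delivering a VMbus message on VMBUS_MESSAGE_SINT, and hv_stimer0_isr() then simply invokes the per-CPU clockevent's event_handler. The stand-alone sketch below (user space, not kernel code) only illustrates how the new hv_timer_config bitfield layout composes the value written to HV_X64_MSR_STIMER0_CONFIG; the 0xF0 vector is a made-up placeholder for stimer0_vector, and the layout assumes GCC's LSB-first bitfield ordering on x86.

/*
 * Stand-alone illustration of the hv_timer_config layout added in
 * hyperv_vmbus.h; compiles as ordinary user-space C with GCC.
 */
#include <stdint.h>
#include <stdio.h>

union hv_timer_config {
	uint64_t as_uint64;
	struct {
		uint64_t enable:1;
		uint64_t periodic:1;
		uint64_t lazy:1;
		uint64_t auto_enable:1;
		uint64_t apic_vector:8;
		uint64_t direct_mode:1;
		uint64_t reserved_z0:3;
		uint64_t sintx:4;
		uint64_t reserved_z1:44;
	};
};

int main(void)
{
	union hv_timer_config cfg = { .as_uint64 = 0 };

	cfg.enable = 1;
	cfg.auto_enable = 1;
	cfg.direct_mode = 1;
	cfg.apic_vector = 0xF0;	/* placeholder; the kernel uses stimer0_vector */

	/* enable (bit 0) | auto_enable (bit 3) | vector in bits 4-11 | direct_mode (bit 12) */
	printf("STIMER0_CONFIG = %#llx\n", (unsigned long long)cfg.as_uint64);
	return 0;
}

In the kernel itself hv_ce_set_oneshot() performs the equivalent per CPU, falling back to sintx = VMBUS_MESSAGE_SINT when Direct Mode is unavailable.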
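The hv_balloon.c rework centralizes the "is this PFN actually backed?" test in has_pfn_is_backed() and reuses it, via hv_page_offline_check(), when handling MEM_OFFLINE, so pages that fall into hot-add gaps or outside any hot-add region no longer distort num_pages_onlined; the MEM_OFFLINE path then subtracts only the counted pages and clamps at zero with a WARN_ON_ONCE() instead of wrapping. Below is a minimal user-space model of that accounting; all PFN ranges are invented for illustration.

/* User-space model of the gap-aware accounting: count only PFNs that are
 * covered by a hot-add region and do not fall inside a gap. */
#include <stdbool.h>
#include <stdio.h>

struct gap { unsigned long start_pfn, end_pfn; };

static bool pfn_is_backed(unsigned long pfn,
			  unsigned long covered_start, unsigned long covered_end,
			  const struct gap *gaps, int ngaps)
{
	if (pfn < covered_start || pfn >= covered_end)
		return false;
	for (int i = 0; i < ngaps; i++)
		if (pfn >= gaps[i].start_pfn && pfn < gaps[i].end_pfn)
			return false;
	return true;
}

int main(void)
{
	/* region covers PFNs [0x10000, 0x10800) with one gap [0x10100, 0x10200) */
	struct gap gaps[] = { { 0x10100, 0x10200 } };
	unsigned long count = 0;

	for (unsigned long pfn = 0x10000; pfn < 0x10800; pfn++)
		if (pfn_is_backed(pfn, 0x10000, 0x10800, gaps, 1))
			count++;

	/* 0x800 pages total minus the 0x100-page gap -> 0x700 backed pages */
	printf("backed pages: %#lx\n", count);
	return 0;
}

The new balloon_status tracepoint added alongside these changes is expected to appear under the hyperv trace system in tracefs (an assumption based on TRACE_SYSTEM being set to hyperv), alongside the existing vmbus tracepoints.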
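In ring_buffer.c, the new hv_pkt_iter_bytes_read() helper has to cope with the private read index wrapping past the end of the ring's data area. A small stand-alone sketch of that computation, with made-up sizes:

/* Sketch of the wrap-around read accounting in hv_pkt_iter_bytes_read();
 * ring size and offsets below are invented values. */
#include <stdint.h>
#include <stdio.h>

static uint32_t bytes_read(uint32_t datasize, uint32_t start, uint32_t now)
{
	if (now >= start)
		return now - start;
	/* the read index wrapped past the end of the data area */
	return datasize - start + now;
}

int main(void)
{
	/* 4096-byte data area: reader moved from offset 3584 to 512 -> 1024 bytes read */
	printf("%u\n", bytes_read(4096, 3584, 512));
	return 0;
}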
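The rewritten hv_pkt_iter_close() then signals the host only when all of the following hold: the host advertised a non-zero pending_send_sz, the write space that existed before this read pass was not already sufficient, and the space available now is. Restated as a small predicate (a sketch using the same quantities as the kernel code, with invented numbers in the example calls):

/* Sketch of the signaling decision: signal only if our reads are what
 * created enough space for the host's pending write. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_signal(uint32_t curr_write_sz, uint32_t bytes_read,
			  uint32_t pending_sz)
{
	if (!pending_sz)
		return false;			/* host has nothing pending */
	if (curr_write_sz - bytes_read > pending_sz)
		return false;			/* host was never blocked */
	if (curr_write_sz <= pending_sz)
		return false;			/* still not enough room; no false hope */
	return true;
}

int main(void)
{
	/* space before our reads was 3072 > 2048 pending: host wasn't blocked */
	printf("%d\n", should_signal(4096, 1024, 2048));
	/* space grew from 2048 to 4096 and now exceeds the 3000-byte pending write */
	printf("%d\n", should_signal(4096, 2048, 3000));
	return 0;
}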