summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/x86_64/boot-options.txt5
-rw-r--r--arch/x86_64/mm/init.c3
-rw-r--r--arch/x86_64/mm/numa.c5
-rw-r--r--arch/x86_64/mm/srat.c164
-rw-r--r--include/asm-x86_64/numa.h2
5 files changed, 171 insertions, 8 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 1921353259ae..f2cd6ef53ff3 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -151,6 +151,11 @@ NUMA
numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine.
+ numa=hotadd=percent
+ Only allow hotadd memory to preallocate page structures upto
+ percent of already available memory.
+ numa=hotadd=0 will disable hotadd memory.
+
ACPI
acpi=off Don't enable ACPI
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 492161168402..dff870534199 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -530,8 +530,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
unsigned long pfn;
unsigned long total = 0, mem = 0;
for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
- unsigned long addr = pfn << PAGE_SHIFT;
- if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) {
+ if (pfn_valid(pfn)) {
online_page(pfn_to_page(pfn));
err = 0;
mem++;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..779132af29a7 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -142,6 +142,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+#ifdef CONFIG_ACPI_NUMA
+ srat_reserve_add_area(nodeid);
+#endif
node_set_online(nodeid);
}
@@ -335,6 +338,8 @@ __init int numa_setup(char *opt)
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt,"noacpi",6))
acpi_numa = -1;
+ if (!strncmp(opt,"hotadd=", 7))
+ hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 1;
}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..443875eb15a2 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
+#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
+ && !defined(CONFIG_MEMORY_HOTPLUG)
+#define RESERVE_HOTADD 1
+#endif
+
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
+static int found_add_area __initdata;
+int hotadd_percent __initdata = 10;
static u8 pxm2node[256] = { [0 ... 255] = 0xff };
/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
+
+ if (found_add_area)
+ return;
+
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
acpi_numa = -1;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ nodes_add[i].start = nodes[i].end = 0;
}
static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
pxm, pa->apic_id, node);
}
+#ifdef RESERVE_HOTADD
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+ static unsigned long allocated;
+ static unsigned long last_area_end;
+ unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+ long mem = pages * sizeof(struct page);
+ unsigned long addr;
+ unsigned long allowed;
+ unsigned long oldpages = pages;
+
+ if (mem < 0)
+ return 0;
+ allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+ allowed = (allowed / 100) * hotadd_percent;
+ if (allocated + mem > allowed) {
+ /* Give them at least part of their hotadd memory upto hotadd_percent
+ It would be better to spread the limit out
+ over multiple hotplug areas, but that is too complicated
+ right now */
+ if (allocated >= allowed)
+ return 0;
+ pages = (allowed - allocated + mem) / sizeof(struct page);
+ mem = pages * sizeof(struct page);
+ nd->end = nd->start + pages*PAGE_SIZE;
+ }
+ /* Not completely fool proof, but a good sanity check */
+ addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+ if (addr == -1UL)
+ return 0;
+ if (pages != oldpages)
+ printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+ pages << PAGE_SHIFT);
+ last_area_end = addr + mem;
+ allocated += mem;
+ return 1;
+}
+
+/*
+ * It is fine to add this area to the nodes data it will be used later
+ * This code supports one contigious hot add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+ unsigned long s_pfn = start >> PAGE_SHIFT;
+ unsigned long e_pfn = end >> PAGE_SHIFT;
+ int changed = 0;
+ struct bootnode *nd = &nodes_add[node];
+
+ /* I had some trouble with strange memory hotadd regions breaking
+ the boot. Be very strict here and reject anything unexpected.
+ If you want working memory hotadd write correct SRATs.
+
+ The node size check is a basic sanity check to guard against
+ mistakes */
+ if ((signed long)(end - start) < NODE_MIN_SIZE) {
+ printk(KERN_ERR "SRAT: Hotplug area too small\n");
+ return -1;
+ }
+
+ /* This check might be a bit too strict, but I'm keeping it for now. */
+ if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+ printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+ return -1;
+ }
+
+ if (!hotadd_enough_memory(&nodes_add[node])) {
+ printk(KERN_ERR "SRAT: Hotplug area too large\n");
+ return -1;
+ }
+
+ /* Looks good */
+
+ found_add_area = 1;
+ if (nd->start == nd->end) {
+ nd->start = start;
+ nd->end = end;
+ changed = 1;
+ } else {
+ if (nd->start == end) {
+ nd->start = start;
+ changed = 1;
+ }
+ if (nd->end == start) {
+ nd->end = end;
+ changed = 1;
+ }
+ if (!changed)
+ printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+ }
+
+ if ((nd->end >> PAGE_SHIFT) > end_pfn)
+ end_pfn = nd->end >> PAGE_SHIFT;
+
+ if (changed)
+ printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+ return 0;
+}
+#endif
+
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
- struct bootnode *nd;
+ struct bootnode *nd, oldnode;
unsigned long start, end;
int node, pxm;
int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
}
if (ma->flags.enabled == 0)
return;
+ if (ma->flags.hot_pluggable && hotadd_percent == 0)
+ return;
start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
bad_srat();
return;
}
- /* It is fine to add this area to the nodes data it will be used later*/
- if (ma->flags.hot_pluggable == 1)
- printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
- start, end);
i = conflicting_nodes(start, end);
if (i == node) {
printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
return;
}
nd = &nodes[node];
+ oldnode = *nd;
if (!node_test_and_set(node, nodes_parsed)) {
nd->start = start;
nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
if (nd->end < end)
nd->end = end;
}
+
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+ if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+ /* Ignore hotadd region. Undo damage */
+ printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+ *nd = oldnode;
+ if ((nd->start | nd->end) == 0)
+ node_clear(node, nodes_parsed);
+ }
+#endif
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
pxmram -= e820_hole_size(s, e);
+ pxmram -= nodes_add[i].end - nodes_add[i].start;
+ if ((long)pxmram < 0)
+ pxmram = 0;
}
e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
- cutoff_node(i, start, end);
+ cutoff_node(i, start, end);
if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
unparse_node(i);
}
@@ -303,6 +436,25 @@ static int node_to_pxm(int n)
return 0;
}
+void __init srat_reserve_add_area(int nodeid)
+{
+ if (found_add_area && nodes_add[nodeid].end) {
+ u64 total_mb;
+
+ printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+ "for node %d at %Lx-%Lx\n",
+ nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+ total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+ >> PAGE_SHIFT;
+ total_mb *= sizeof(struct page);
+ total_mb >>= 20;
+ printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+ "pre-allocated memory.\n", (unsigned long long)total_mb);
+ reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+ nodes_add[nodeid].end - nodes_add[nodeid].start);
+ }
+}
+
int __node_distance(int a, int b)
{
int index;
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index f6cbb4cbb5a3..f0ba4d984bdf 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -18,6 +18,8 @@ extern void numa_init_array(void);
extern int numa_off;
extern void numa_set_node(int cpu, int node);
+extern void srat_reserve_add_area(int nodeid);
+extern int hotadd_percent;
extern unsigned char apicid_to_node[256];
#ifdef CONFIG_NUMA
OpenPOWER on IntegriCloud