5 files changed, 171 insertions, 8 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 1921353259ae..f2cd6ef53ff3 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -151,6 +151,11 @@ NUMA
 
   numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
 
+  numa=hotadd=percent
+		Only allow hotadd memory to preallocate page structures up to
+		the given percentage of already available memory.
+		numa=hotadd=0 will disable hotadd memory. The default is 10.
+
 ACPI
 
   acpi=off	Don't enable ACPI
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 492161168402..dff870534199 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -530,8 +530,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
 	unsigned long pfn;
 	unsigned long total = 0, mem = 0;
 	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		unsigned long addr = pfn << PAGE_SHIFT;
-		if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) {
+		if (pfn_valid(pfn)) {
 			online_page(pfn_to_page(pfn));
 			err = 0;
 			mem++;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..779132af29a7 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -142,6 +142,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 
 	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
 	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+#ifdef CONFIG_ACPI_NUMA
+	srat_reserve_add_area(nodeid);
+#endif
 	node_set_online(nodeid);
 } 
 
@@ -335,6 +338,8 @@ __init int numa_setup(char *opt)
 #ifdef CONFIG_ACPI_NUMA
  	if (!strncmp(opt,"noacpi",6))
  		acpi_numa = -1;
+	if (!strncmp(opt,"hotadd=", 7))
+		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
 	return 1;
 } 
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..443875eb15a2 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
 #include <linux/bitmap.h>
 #include <linux/module.h>
 #include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
 #include <asm/proto.h>
 #include <asm/numa.h>
 #include <asm/e820.h>
 
+#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
+		&& !defined(CONFIG_MEMORY_HOTPLUG)
+#define RESERVE_HOTADD 1
+#endif
+
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
 static nodemask_t nodes_found __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES] __initdata; /* hotadd areas */
+static int found_add_area __initdata;	/* set once reserve_hotadd() accepts */
+int hotadd_percent __initdata = 10;	/* set by the numa=hotadd= option */
 static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 /* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
+
+	if (found_add_area)
+		return;
+
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
 	acpi_numa = -1;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		nodes_add[i].start = nodes[i].end = 0;
 }
 
 static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	       pxm, pa->apic_id, node);
 }
 
+#ifdef RESERVE_HOTADD
+/* Protect against too large hotadd areas that would fill up memory: the page
+ * structs of all accepted areas may use at most hotadd_percent of usable RAM.
+ * Returns 1 if the area (possibly shrunk via nd->end) fits, 0 otherwise. */
+static __init int hotadd_enough_memory(struct bootnode *nd)
+{
+	static unsigned long allocated;	/* page struct bytes accepted */
+	static unsigned long last_area_end;	/* next e820 search start */
+	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+	long mem = pages * sizeof(struct page);
+	unsigned long addr;
+	unsigned long allowed;
+	unsigned long oldpages = pages;
+
+	if (mem < 0)	/* does not fit in a long */
+		return 0;
+	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+	allowed = (allowed / 100) * hotadd_percent;
+	if (allocated + mem > allowed) {
+		/* Give them at least the part of their hotadd memory that the
+		   rest of the hotadd_percent budget has page structs for.
+		   It would be better to spread the limit out over multiple
+		   hotplug areas, but that is too complicated right now */
+		if (allocated >= allowed)
+			return 0;
+		pages = (allowed - allocated) / sizeof(struct page);
+		mem = pages * sizeof(struct page);
+		nd->end = nd->start + pages*PAGE_SIZE;
+	}
+	/* Not completely fool proof, but a good sanity check */
+	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+	if (addr == -1UL)
+		return 0;
+	if (pages != oldpages)
+		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+			pages << PAGE_SHIFT);
+	last_area_end = addr + mem;
+	allocated += mem;
+	return 1;
+}
+
+/*
+ * Record an SRAT hot-add range in nodes_add[node]; one contiguous hot-add
+ * area per node is supported. Returns 0 if accepted, -1 if rejected.
+ */
+static __init int
+reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+	unsigned long s_pfn = start >> PAGE_SHIFT;
+	unsigned long e_pfn = end >> PAGE_SHIFT;
+	int changed = 0;
+	struct bootnode *nd = &nodes_add[node];
+
+	/* I had some trouble with strange memory hotadd regions breaking
+	   the boot. Be very strict here and reject anything unexpected.
+	   If you want working memory hotadd write correct SRATs.
+	   The node size check is a basic sanity check against mistakes. */
+	if ((signed long)(end - start) < NODE_MIN_SIZE) {
+		printk(KERN_ERR "SRAT: Hotplug area too small\n");
+		return -1;
+	}
+
+	/* This check might be a bit too strict, but I'm keeping it for now. */
+	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+		return -1;
+	}
+
+	/* NOTE(review): checks the area recorded so far, not [start, end) */
+	if (!hotadd_enough_memory(nd)) {
+		printk(KERN_ERR "SRAT: Hotplug area too large\n");
+		return -1;
+	}
+
+	/* Looks good */
+	found_add_area = 1;
+	if (nd->start == nd->end) {
+		nd->start = start;
+		nd->end = end;
+		changed = 1;
+	} else {
+		/* Only grow the recorded area by a directly adjacent range */
+		if (nd->start == end) {
+			nd->start = start;
+			changed = 1;
+		}
+		if (nd->end == start) {
+			nd->end = end;
+			changed = 1;
+		}
+		if (!changed)
+			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+	}
+
+	/* Make sure end_pfn covers the recorded hot-add area */
+	if ((nd->end >> PAGE_SHIFT) > end_pfn)
+		end_pfn = nd->end >> PAGE_SHIFT;
+	if (changed)
+		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+	return 0;
+}
+#endif
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 void __init
 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 {
-	struct bootnode *nd;
+	struct bootnode *nd, oldnode;
 	unsigned long start, end;
 	int node, pxm;
 	int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 	}
 	if (ma->flags.enabled == 0)
 		return;
+ 	if (ma->flags.hot_pluggable && hotadd_percent == 0)
+		return;
 	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
 	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
 	pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		bad_srat();
 		return;
 	}
-	/* It is fine to add this area to the nodes data it will be used later*/
-	if (ma->flags.hot_pluggable == 1)
-		printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
-				start, end);
 	i = conflicting_nodes(start, end);
 	if (i == node) {
 		printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		return;
 	}
 	nd = &nodes[node];
+	oldnode = *nd;
 	if (!node_test_and_set(node, nodes_parsed)) {
 		nd->start = start;
 		nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
+
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+ 	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+		/* Ignore hotadd region. Undo damage */
+		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+		*nd = oldnode;
+		if ((nd->start | nd->end) == 0)
+			node_clear(node, nodes_parsed);
+	}
+#endif
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
 		pxmram -= e820_hole_size(s, e);
+		pxmram -= nodes_add[i].end - nodes_add[i].start;
+		if ((long)pxmram < 0)
+			pxmram = 0;
 	}
 
 	e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cutoff_node(i, start, end);
+ 		cutoff_node(i, start, end);
 		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
 			unparse_node(i);
 	}
@@ -303,6 +436,25 @@ static int node_to_pxm(int n)
        return 0;
 }
 
+/* Reserve the hot-add range recorded for nodeid via reserve_bootmem_node()
+ * and log how many MB the page structs for that range take. */
+void __init srat_reserve_add_area(int nodeid)
+{
+	struct bootnode *area = &nodes_add[nodeid];
+	u64 span, memmap_mb;
+
+	if (!found_add_area || !area->end)
+		return;
+
+	span = area->end - area->start;
+	printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+		"for node %d at %Lx-%Lx\n", nodeid, area->start, area->end);
+	memmap_mb = ((span >> PAGE_SHIFT) * sizeof(struct page)) >> 20;
+	printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+		"pre-allocated memory.\n", (unsigned long long)memmap_mb);
+	reserve_bootmem_node(NODE_DATA(nodeid), area->start, span);
+}
+
 int __node_distance(int a, int b)
 {
 	int index;
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index f6cbb4cbb5a3..f0ba4d984bdf 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -18,6 +18,8 @@ extern void numa_init_array(void);
 extern int numa_off;
 
 extern void numa_set_node(int cpu, int node);
+extern void srat_reserve_add_area(int nodeid);
+extern int hotadd_percent;
 
 extern unsigned char apicid_to_node[256];
 #ifdef CONFIG_NUMA