Merge branch 'x86/numa' into x86/devel

Conflicts: arch/x86/Kconfig arch/x86/kernel/e820.c arch/x86/kernel/efi_64.c arch/x86/kernel/mpparse.c arch/x86/kernel/setup.c arch/x86/kernel/setup_32.c arch/x86/mm/init_64.c include/asm-x86/proto.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2008-07-08 11:59:23 +0200
committer: Ingo Molnar <mingo@elte.hu> 2008-07-08 11:59:23 +0200
commit: 2b4fa851b2f06fdb04cac808b57324f5e51e1578 (patch)
tree: 97db3ad5adda7683923630982f68b8b52c86e790
parent: 3de352bbd86f890dd0c5e1c09a6a1b0b29e0f8ce (diff)
parent: 46f68e1c6b04a04772e828ff3bcd07ed708805c2 (diff)
download: blackbird-op-linux-2b4fa851b2f06fdb04cac808b57324f5e51e1578.tar.gz
blackbird-op-linux-2b4fa851b2f06fdb04cac808b57324f5e51e1578.zip
22 files changed, 611 insertions, 278 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 640dc62a7fa0..112afd368c77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
 config HAVE_SETUP_PER_CPU_AREA
-	def_bool X86_64 || (X86_SMP && !X86_VOYAGER)
+	def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
 
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
@@ -579,7 +579,21 @@ config SWIOTLB
 
 config IOMMU_HELPER
 	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB)
+config MAXSMP
+	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
+	depends on X86_64 && SMP
+	default n
+	help
+	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
+	  If unsure, say N.
 
+if MAXSMP
+config NR_CPUS
+	int
+	default "4096"
+endif
+
+if !MAXSMP
 config NR_CPUS
 	int "Maximum number of CPUs (2-4096)"
 	range 2 4096
@@ -592,7 +606,8 @@ config NR_CPUS
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
-	  approximately one kilobyte to the kernel image.
+	  approximately eight kilobytes to the kernel image.
+endif
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
@@ -983,13 +998,25 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+if MAXSMP
+
+config NODES_SHIFT
+	int
+	default "9"
+endif
+
+if !MAXSMP
 config NODES_SHIFT
-	int "Max num nodes shift(1-9)"
-	range 1 9  if X86_64
+	int "Maximum NUMA Nodes (as a power of 2)"
+	range 1 9   if X86_64
 	default "6" if X86_64
 	default "4" if X86_NUMAQ
 	default "3"
 	depends on NEED_MULTIPLE_NODES
+	help
+	  Specify the maximum number of NUMA Nodes available on the target
+	  system.  Increases memory reserved to accomodate various tables.
+endif
 
 config HAVE_ARCH_BOOTMEM_NODE
 	def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index f0684bb74faf..acc0271920f2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -68,7 +68,7 @@ config DEBUG_PAGEALLOC
 config DEBUG_PER_CPU_MAPS
 	bool "Debug access to per_cpu maps"
 	depends on DEBUG_KERNEL
-	depends on X86_64_SMP
+	depends on X86_SMP
 	default n
 	help
 	  Say Y to verify that the per_cpu map being accessed has
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 570c362eca8c..84ce106b33c8 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,9 +52,6 @@
 
 unsigned long mp_lapic_addr;
 
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
 /*
  * Knob to control our willingness to enable the local APIC.
  *
@@ -1546,9 +1543,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	}
 #ifdef CONFIG_SMP
 	/* are we being called early in kernel startup? */
-	if (x86_cpu_to_apicid_early_ptr) {
-		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
-		u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 
 		cpu_to_apicid[cpu] = apicid;
 		bios_cpu_apicid[cpu] = apicid;
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index d7406aa1c985..e494809fc508 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -90,9 +90,6 @@ static unsigned long apic_phys;
 
 unsigned long mp_lapic_addr;
 
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
 unsigned int __cpuinitdata maxcpus = NR_CPUS;
 /*
  * Get the LAPIC version
@@ -1075,9 +1072,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		max_physical_apicid = apicid;
 
 	/* are we being called early in kernel startup? */
-	if (x86_cpu_to_apicid_early_ptr) {
-		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
-		u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 
 		cpu_to_apicid[cpu] = apicid;
 		bios_cpu_apicid[cpu] = apicid;
@@ -1253,7 +1250,7 @@ __cpuinit int apic_is_clustered_box(void)
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
 		return 0;
 
-	bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
 	for (i = 0; i < NR_CPUS; i++) {
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5fbed459ff3b..c970929bb15d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,20 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
+/* boot cpu pda */
+static struct x8664_pda _boot_cpu_pda __read_mostly;
+
+#ifdef CONFIG_SMP
+/*
+ * We install an empty cpu_pda pointer table to indicate to early users
+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
+ * the boot cpu is not yet setup.
+ */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
+#else
+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
+#endif
+
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -88,10 +102,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 	early_printk("Kernel alive\n");
 
- 	for (i = 0; i < NR_CPUS; i++)
- 		cpu_pda(i) = &boot_cpu_pda[i];
-
+	_cpu_pda = __cpu_pda;
+	cpu_pda(0) = &_boot_cpu_pda;
 	pda_init(0);
+
+	early_printk("Kernel really alive\n");
+
 	copy_bootdata(__va(real_mode_data));
 
 	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 0060e44e8989..d62f3b66b529 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -90,7 +90,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active))
 		return 0;
 
-	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
 		goto error;
 
@@ -101,7 +101,7 @@ int __init check_nmi_watchdog(void)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 #endif
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5b0de38cde48..ebb0a2bcdc08 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -20,13 +20,34 @@ unsigned int boot_cpu_physical_apicid = -1U;
 unsigned int max_physical_apicid;
 EXPORT_SYMBOL(boot_cpu_physical_apicid);
 
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
 #endif
 
+/* map cpu index to physical APIC ID */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#define	X86_64_NUMA	1
+
+/* map cpu index to node index */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/* which logical CPUs are on which nodes */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/* setup node_to_cpumask_map */
+static void __init setup_node_to_cpumask_map(void);
+
+#else
+static inline void setup_node_to_cpumask_map(void) { }
+#endif
+
 #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
 /*
  * Copy data used in early init routines from the initial arrays to the
@@ -38,20 +59,21 @@ static void __init setup_per_cpu_maps(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
+		per_cpu(x86_cpu_to_apicid, cpu) =
+				early_per_cpu_map(x86_cpu_to_apicid, cpu);
 		per_cpu(x86_bios_cpu_apicid, cpu) =
-						x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
+				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#ifdef X86_64_NUMA
 		per_cpu(x86_cpu_to_node_map, cpu) =
-						x86_cpu_to_node_map_init[cpu];
+				early_per_cpu_map(x86_cpu_to_node_map, cpu);
 #endif
 	}
 
 	/* indicate the early static arrays will soon be gone */
-	x86_cpu_to_apicid_early_ptr = NULL;
-	x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = NULL;
+	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#ifdef X86_64_NUMA
+	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 }
 
@@ -80,6 +102,50 @@ static inline void setup_cpumask_of_cpu(void) { }
  */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
+static inline void setup_cpu_pda_map(void) { }
+
+#elif !defined(CONFIG_SMP)
+static inline void setup_cpu_pda_map(void) { }
+
+#else /* CONFIG_SMP && CONFIG_X86_64 */
+
+/*
+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
+ */
+static void __init setup_cpu_pda_map(void)
+{
+	char *pda;
+	struct x8664_pda **new_cpu_pda;
+	unsigned long size;
+	int cpu;
+
+	size = roundup(sizeof(struct x8664_pda), cache_line_size());
+
+	/* allocate cpu_pda array and pointer table */
+	{
+		unsigned long tsize = nr_cpu_ids * sizeof(void *);
+		unsigned long asize = size * (nr_cpu_ids - 1);
+
+		tsize = roundup(tsize, cache_line_size());
+		new_cpu_pda = alloc_bootmem(tsize + asize);
+		pda = (char *)new_cpu_pda + tsize;
+	}
+
+	/* initialize pointer table to static pda's */
+	for_each_possible_cpu(cpu) {
+		if (cpu == 0) {
+			/* leave boot cpu pda in place */
+			new_cpu_pda[0] = cpu_pda(0);
+			continue;
+		}
+		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
+		new_cpu_pda[cpu]->in_bootmem = 1;
+		pda += size;
+	}
+
+	/* point to new pointer table */
+	_cpu_pda = new_cpu_pda;
+}
 #endif
 
 /*
@@ -89,50 +155,52 @@ EXPORT_SYMBOL(__per_cpu_offset);
  */
 void __init setup_per_cpu_areas(void)
 {
-	int i, highest_cpu = 0;
-	unsigned long size;
+	ssize_t size = PERCPU_ENOUGH_ROOM;
+	char *ptr;
+	int cpu;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	prefill_possible_map();
+#else
+	nr_cpu_ids = num_processors;
 #endif
 
+	/* Setup cpu_pda map */
+	setup_cpu_pda_map();
+
 	/* Copy section for each CPU (we discard the original) */
 	size = PERCPU_ENOUGH_ROOM;
-	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
+	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
-	for_each_possible_cpu(i) {
-		char *ptr;
+	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 		ptr = alloc_bootmem_pages(size);
 #else
-		int node = early_cpu_to_node(i);
+		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
 			ptr = alloc_bootmem_pages(size);
 			printk(KERN_INFO
-			       "cpu %d has no node or node-local memory\n", i);
+			       "cpu %d has no node %d or node-local memory\n",
+				cpu, node);
 		}
 		else
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
-		if (!ptr)
-			panic("Cannot allocate cpu data for CPU %d\n", i);
-#ifdef CONFIG_X86_64
-		cpu_pda(i)->data_offset = ptr - __per_cpu_start;
-#else
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-#endif
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 
-		highest_cpu = i;
 	}
 
-	nr_cpu_ids = highest_cpu + 1;
-	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+		NR_CPUS, nr_cpu_ids, nr_node_ids);
 
 	/* Setup percpu data maps */
 	setup_per_cpu_maps();
 
+	/* Setup node to cpumask map */
+	setup_node_to_cpumask_map();
+
 	/* Setup cpumask_of_cpu map */
 	setup_cpumask_of_cpu();
 }
@@ -163,3 +231,176 @@ void __init parse_setup_data(void)
 		early_iounmap(data, PAGE_SIZE);
 	}
 }
+
+#ifdef X86_64_NUMA
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+
+	Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+		map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
+		cpu_pda(cpu)->nodenumber = node;
+
+	if (cpu_to_node_map)
+		cpu_to_node_map[cpu] = node;
+
+	else if (per_cpu_offset(cpu))
+		per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	else
+		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), *mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
+ }
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return &cpu_online_map;
+	}
+	BUG_ON(node >= nr_node_ids);
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(_node_to_cpumask_ptr);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	BUG_ON(node >= nr_node_ids);
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+#endif /* X86_64_NUMA */
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index aee0e8200777..631ea6cc01d8 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -12,6 +12,7 @@
 #include <linux/bitops.h>
 #include <linux/module.h>
 #include <linux/kgdb.h>
+#include <linux/topology.h>
 #include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -34,9 +35,8 @@ struct boot_params boot_params;
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+struct x8664_pda **_cpu_pda __read_mostly;
 EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
@@ -114,8 +114,10 @@ void pda_init(int cpu)
 			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
 		if (!pda->irqstackptr)
 			panic("cannot allocate irqstack for cpu %d", cpu); 
-	}
 
+		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+			pda->nodenumber = cpu_to_node(cpu);
+	}
 
 	pda->irqstackptr += IRQSTACKSIZE-64;
 } 
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 7e06ecd83174..a9b19ad24edb 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -659,18 +659,6 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-#ifdef CONFIG_NUMA
-/*
- * In the golden day, when everything among i386 and x86_64 will be
- * integrated, this will not live here
- */
-void *x86_cpu_to_node_map_early_ptr;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
-	[0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-#endif
-
 static void probe_roms(void);
 
 /*
@@ -866,18 +854,6 @@ void __init setup_arch(char **cmdline_p)
 
 	paravirt_post_allocator_init();
 
-#ifdef CONFIG_X86_SMP
-	/*
-	 * setup to use the early static init tables during kernel startup
-	 * X86_SMP will exclude sub-arches that don't deal well with it.
-	 */
-	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-	x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
 #endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 9a87113ba996..16ef53ab538a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -376,15 +376,6 @@ void __init setup_arch(char **cmdline_p)
 	kvmclock_init();
 #endif
 
-#ifdef CONFIG_SMP
-	/* setup to use the early static init tables during kernel startup */
-	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-	x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6be701f3027f..ae0a7a200421 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -67,22 +67,6 @@
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
 
-/*
- * FIXME: For x86_64, those are defined in other files. But moving them here,
- * would make the setup areas dependent on smp, which is a loss. When we
- * integrate apic between arches, we can probably do a better job, but
- * right now, they'll stay here -- glommer
- */
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
-			{ [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_early_ptr;
-
-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-				= { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_bios_cpu_apicid_early_ptr;
-
 #ifdef CONFIG_X86_32
 u8 apicid_2_node[MAX_APICID];
 static int low_mappings;
@@ -814,6 +798,45 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
+#ifdef CONFIG_X86_64
+/*
+ * Allocate node local memory for the AP pda.
+ *
+ * Must be called after the _cpu_pda pointer table is initialized.
+ */
+static int __cpuinit get_local_pda(int cpu)
+{
+	struct x8664_pda *oldpda, *newpda;
+	unsigned long size = sizeof(struct x8664_pda);
+	int node = cpu_to_node(cpu);
+
+	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+		return 0;
+
+	oldpda = cpu_pda(cpu);
+	newpda = kmalloc_node(size, GFP_ATOMIC, node);
+	if (!newpda) {
+		printk(KERN_ERR "Could not allocate node local PDA "
+			"for CPU %d on node %d\n", cpu, node);
+
+		if (oldpda)
+			return 0;	/* have a usable pda */
+		else
+			return -1;
+	}
+
+	if (oldpda) {
+		memcpy(newpda, oldpda, size);
+		if (!after_bootmem)
+			free_bootmem((unsigned long)oldpda, size);
+	}
+
+	newpda->in_bootmem = 0;
+	cpu_pda(cpu) = newpda;
+	return 0;
+}
+#endif /* CONFIG_X86_64 */
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -839,19 +862,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	}
 
 	/* Allocate node local memory for AP pdas */
-	if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-		struct x8664_pda *newpda, *pda;
-		int node = cpu_to_node(cpu);
-		pda = cpu_pda(cpu);
-		newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
-				      node);
-		if (newpda) {
-			memcpy(newpda, pda, sizeof(struct x8664_pda));
-			cpu_pda(cpu) = newpda;
-		} else
-			printk(KERN_ERR
-		"Could not allocate node local PDA for CPU %d on node %d\n",
-				cpu, node);
+	if (cpu > 0) {
+		boot_error = get_local_pda(cpu);
+		if (boot_error)
+			goto restore_state;
+			/* if can't get pda memory, can't start cpu */
 	}
 #endif
 
@@ -970,11 +985,13 @@ do_rest:
 		}
 	}
 
+restore_state:
+
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		unmap_cpu_to_logical_apicid(cpu);
 #ifdef CONFIG_X86_64
-		clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
 #endif
 		cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
 		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
@@ -1347,6 +1364,8 @@ __init void prefill_possible_map(void)
 
 	for (i = 0; i < possible; i++)
 		cpu_set(i, cpu_possible_map);
+
+	nr_cpu_ids = possible;
 }
 
 static void __ref remove_cpu_from_maps(int cpu)
@@ -1357,7 +1376,7 @@ static void __ref remove_cpu_from_maps(int cpu)
 	cpu_clear(cpu, cpu_callin_map);
 	/* was set by cpu_init() */
 	clear_bit(cpu, (unsigned long *)&cpu_initialized);
-	clear_node_cpumask(cpu);
+	numa_remove_cpu(cpu);
 #endif
 }
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index afb07ffb931d..c4557e25f60c 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 struct memnode memnode;
 
-#ifdef CONFIG_SMP
-int x86_cpu_to_node_map_init[NR_CPUS] = {
-	[0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-void *x86_cpu_to_node_map_early_ptr;
-EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
-#endif
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
 int numa_off __initdata;
-unsigned long __initdata nodemap_addr;
-unsigned long __initdata nodemap_size;
+static unsigned long __initdata nodemap_addr;
+static unsigned long __initdata nodemap_size;
 
 /*
  * Given a shift value, try to populate memnodemap[]
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 void __init setup_node_bootmem(int nodeid, unsigned long start,
 			       unsigned long end)
 {
-	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
+	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
 	unsigned long bootmap_start, nodedata_phys;
 	void *bootmap;
 	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	       start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = end >> PAGE_SHIFT;
+	last_pfn = end >> PAGE_SHIFT;
 
 	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
 					   SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
 	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
-	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
 	/*
 	 * Find a place for the bootmem map
@@ -226,7 +213,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	 * early_node_mem will get that with find_e820_area instead
 	 * of alloc_bootmem, that could clash with reserved range
 	 */
-	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
 	nid = phys_to_nid(nodedata_phys);
 	if (nid == nodeid)
 		bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap_start >> PAGE_SHIFT,
-					 start_pfn, end_pfn);
+					 start_pfn, last_pfn);
 
 	printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
 		 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-char *cmdline __initdata;
+static char *cmdline __initdata;
 
 /*
  * Setups up nid to range from addr to addr + size.  If the end
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 }
 
 /*
- * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
-	u64 max_addr = end_pfn << PAGE_SHIFT;
+	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 
 	memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 {
 	int i;
 
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, end_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 
 #ifdef CONFIG_ACPI_NUMA
 	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-					  end_pfn << PAGE_SHIFT))
+					  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 
 #ifdef CONFIG_K8_NUMA
 	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					end_pfn<<PAGE_SHIFT))
+					last_pfn<<PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       start_pfn << PAGE_SHIFT,
-	       end_pfn << PAGE_SHIFT);
+	       last_pfn << PAGE_SHIFT);
 	/* setup dummy node covering all memory */
 	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
-	/* cpumask_of_cpu() may not be available during early startup */
-	memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
-	cpu_set(0, node_to_cpumask_map[0]);
-	e820_register_active_regions(0, start_pfn, end_pfn);
-	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
-}
-
-__cpuinit void numa_add_cpu(int cpu)
-{
-	set_bit(cpu,
-		(unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
-	int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
-
-	if(cpu_to_node_map)
-		cpu_to_node_map[cpu] = node;
-	else if(per_cpu_offset(cpu))
-		per_cpu(x86_cpu_to_node_map, cpu) = node;
-	else
-		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+	e820_register_active_regions(0, start_pfn, last_pfn);
+	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 }
 
 unsigned long __init numa_free_all_bootmem(void)
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
 }
 early_param("numa", numa_setup);
 
+#ifdef CONFIG_NUMA
 /*
  * Setup early cpu_to_node.
  *
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
  * is already initialized in a round robin manner at numa_init_array,
  * prior to this call, and this initialization is good enough
  * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
  */
 void __init init_cpu_to_node(void)
 {
-	int i;
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
 
-	for (i = 0; i < NR_CPUS; i++) {
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
 		int node;
-		u16 apicid = x86_cpu_to_apicid_init[i];
+		u16 apicid = cpu_to_apicid[cpu];
 
 		if (apicid == BAD_APICID)
 			continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
 			continue;
 		if (!node_online(node))
 			continue;
-		numa_set_node(i, node);
+		numa_set_node(cpu, node);
 	}
 }
+#endif
 
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 391d51035871..b67f5a16755f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(node, node_possible_map))
-			numa_set_node(i, NUMA_NO_NODE);
+			numa_clear_node(i);
 	}
 	numa_init_array();
 	return 0;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 464279da49c4..4fa52d3dc848 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -171,8 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
 	if (node != -1)
 		set_mp_bus_to_node(busnum, node);
 	else
-		node = get_mp_bus_to_node(busnum);
 #endif
+		node = get_mp_bus_to_node(busnum);
+
+	if (node != -1 && !node_online(node))
+		node = -1;
 
 	/* Allocate per-root-bus (not per bus) arch-specific data.
 	 * TODO: leak; this memory is never freed.
@@ -204,14 +207,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
 	if (!bus)
 		kfree(sd);
 
+	if (bus && node != -1) {
 #ifdef CONFIG_ACPI_NUMA
-	if (bus) {
-		if (pxm >= 0) {
+		if (pxm >= 0)
 			printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
-				busnum, pxm, pxm_to_node(pxm));
-		}
-	}
+				busnum, pxm, node);
+#else
+		printk(KERN_DEBUG "bus %02x -> node %d\n",
+			busnum, node);
 #endif
+	}
 
 	if (bus && (pci_probe & PCI_USE__CRS))
 		get_current_resources(device, busnum, domain, bus);
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index fdf4044d2e74..1efe162e16d7 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
 }
 
+#if defined(topology_thread_siblings) || defined(topology_core_siblings)
 static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 {
 	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 	}
 	return n;
 }
+#endif
 
+#ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static inline ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, &(topology_##name(cpu)), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(1, &(topology_##name(cpu)), buf);		\
 }
 
+#else
+#define define_siblings_show_map(name)					\
+static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+{									\
+	unsigned int cpu = dev->id;					\
+	cpumask_t mask = topology_##name(cpu);				\
+	return show_cpumap(0, &mask, buf);				\
+}
+
+#define define_siblings_show_list(name)					\
+static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+{									\
+	unsigned int cpu = dev->id;					\
+	cpumask_t mask = topology_##name(cpu);				\
+	return show_cpumap(1, &mask, buf);				\
+}
+#endif
+
 #define define_siblings_show_func(name)		\
 	define_siblings_show_map(name); define_siblings_show_list(name)
 
diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h
index 22e87c9f6a80..b510daf4f4d8 100644
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
-extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern void numa_set_node(int cpu, int node);
 extern void srat_reserve_add_area(int nodeid);
 extern int hotadd_percent;
 
@@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 #ifdef CONFIG_NUMA
 extern void __init init_cpu_to_node(void);
-
-static inline void clear_node_cpumask(int cpu)
-{
-	clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]);
-}
-
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
 #else
-#define init_cpu_to_node() do {} while (0)
-#define clear_node_cpumask(cpu) do {} while (0)
+static inline void init_cpu_to_node(void)		{ }
+static inline void numa_set_node(int cpu, int node)	{ }
+static inline void numa_clear_node(int cpu)		{ }
+static inline void numa_add_cpu(int cpu, int node)	{ }
+static inline void numa_remove_cpu(int cpu)		{ }
 #endif
 
 #endif
diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 101fb9e11954..b34e9a7cc80b 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -22,6 +22,8 @@ struct x8664_pda {
 					   offset 40!!! */
 #endif
 	char *irqstackptr;
+	short nodenumber;		/* number of current node (32k max) */
+	short in_bootmem;		/* pda lives in bootmem */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
 	short mmu_state;
@@ -37,8 +39,7 @@ struct x8664_pda {
 	unsigned irq_spurious_count;
 } ____cacheline_aligned_in_smp;
 
-extern struct x8664_pda *_cpu_pda[];
-extern struct x8664_pda boot_cpu_pda[];
+extern struct x8664_pda **_cpu_pda;
 extern void pda_init(int);
 
 #define cpu_pda(i) (_cpu_pda[i])
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index 736fc3bb8e1e..912a3a17b9db 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -143,4 +143,50 @@ do {							\
 #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
 #endif /* !__ASSEMBLY__ */
 #endif /* !CONFIG_X86_64 */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Define the "EARLY_PER_CPU" macros.  These are used for some per_cpu
+ * variables that are initialized and accessed before there are per_cpu
+ * areas allocated.
+ */
+
+#define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)			\
+	DEFINE_PER_CPU(_type, _name) = _initvalue;			\
+	__typeof__(_type) _name##_early_map[NR_CPUS] __initdata =	\
+				{ [0 ... NR_CPUS-1] = _initvalue };	\
+	__typeof__(_type) *_name##_early_ptr = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
+	EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name)			\
+	DECLARE_PER_CPU(_type, _name);				\
+	extern __typeof__(_type) *_name##_early_ptr;		\
+	extern __typeof__(_type)  _name##_early_map[]
+
+#define	early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define	early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+#define	early_per_cpu(_name, _cpu) 				\
+	(early_per_cpu_ptr(_name) ?				\
+		early_per_cpu_ptr(_name)[_cpu] :		\
+		per_cpu(_name, _cpu))
+
+#else	/* !CONFIG_SMP */
+#define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)		\
+	DEFINE_PER_CPU(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
+	EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name)			\
+	DECLARE_PER_CPU(_type, _name)
+
+#define	early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define	early_per_cpu_ptr(_name) NULL
+/* no early_per_cpu_map() */
+
+#endif	/* !CONFIG_SMP */
+
 #endif /* _ASM_X86_PERCPU_H_ */
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 514e52b95cef..fc1007321ef6 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -29,21 +29,12 @@ extern int smp_num_siblings;
 extern unsigned int num_processors;
 extern cpumask_t cpu_initialized;
 
-#ifdef CONFIG_SMP
-extern u16 x86_cpu_to_apicid_init[];
-extern u16 x86_bios_cpu_apicid_init[];
-extern void *x86_cpu_to_apicid_early_ptr;
-extern void *x86_bios_cpu_apicid_early_ptr;
-#else
-#define x86_cpu_to_apicid_early_ptr NULL
-#define x86_bios_cpu_apicid_early_ptr NULL
-#endif
-
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
 /* Static state in head.S used to set up a CPU */
 extern struct {
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index dcf3f8131d6b..1f97758de4ab 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -35,79 +35,88 @@
 # endif
 #endif
 
+/* Node not present */
+#define NUMA_NO_NODE	(-1)
+
 #ifdef CONFIG_NUMA
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
 
-/* Mappings between logical cpu number and node number */
 #ifdef CONFIG_X86_32
-extern int cpu_to_node_map[];
-#else
-/* Returns the number of the current Node. */
-#define numa_node_id()		(early_cpu_to_node(raw_smp_processor_id()))
-#endif
-
-DECLARE_PER_CPU(int, x86_cpu_to_node_map);
-
-#ifdef CONFIG_SMP
-extern int x86_cpu_to_node_map_init[];
-extern void *x86_cpu_to_node_map_early_ptr;
-#else
-#define x86_cpu_to_node_map_early_ptr NULL
-#endif
 
+/* Mappings between node number and cpus on that node. */
 extern cpumask_t node_to_cpumask_map[];
 
-#define NUMA_NO_NODE	(-1)
+/* Mappings between logical cpu number and node number */
+extern int cpu_to_node_map[];
 
 /* Returns the number of the node containing CPU 'cpu' */
-#ifdef CONFIG_X86_32
-#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 static inline int cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
+#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 
-#else /* CONFIG_X86_64 */
-
-#ifdef CONFIG_SMP
-static inline int early_cpu_to_node(int cpu)
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
 {
-	int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
-
-	if (cpu_to_node_map)
-		return cpu_to_node_map[cpu];
-	else if (per_cpu_offset(cpu))
-		return per_cpu(x86_cpu_to_node_map, cpu);
-	else
-		return NUMA_NO_NODE;
+	return node_to_cpumask_map[node];
 }
-#else
-#define	early_cpu_to_node(cpu)	cpu_to_node(cpu)
-#endif
 
+#else /* CONFIG_X86_64 */
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t *node_to_cpumask_map;
+
+/* Mappings between logical cpu number and node number */
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
+
+/* Returns the number of the current Node. */
+#define numa_node_id()		read_pda(nodenumber)
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern int cpu_to_node(int cpu);
+extern int early_cpu_to_node(int cpu);
+extern cpumask_t *_node_to_cpumask_ptr(int node);
+extern cpumask_t node_to_cpumask(int node);
+
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Returns the number of the node containing CPU 'cpu' */
 static inline int cpu_to_node(int cpu)
 {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	if (x86_cpu_to_node_map_early_ptr) {
-		printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n",
-		       (int)cpu);
-		dump_stack();
-		return ((int *)x86_cpu_to_node_map_early_ptr)[cpu];
-	}
-#endif
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
 
-#ifdef	CONFIG_NUMA
+/* Same function but used if called before per_cpu areas are setup */
+static inline int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
 
 /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	return &node_to_cpumask_map[node];
+}
+
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
+{
+	return node_to_cpumask_map[node];
+}
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Replace default node_to_cpumask_ptr with optimized version */
 #define node_to_cpumask_ptr(v, node)		\
-		cpumask_t *v = &(node_to_cpumask_map[node])
+		cpumask_t *v = _node_to_cpumask_ptr(node)
 
 #define node_to_cpumask_ptr_next(v, node)	\
-			   v = &(node_to_cpumask_map[node])
-#endif
+			   v = _node_to_cpumask_ptr(node)
 
 #endif /* CONFIG_X86_64 */
 
@@ -117,20 +126,6 @@ static inline int cpu_to_node(int cpu)
  */
 #define parent_node(node) (node)
 
-/* Returns a bitmask of CPUs on Node 'node'. */
-static inline cpumask_t node_to_cpumask(int node)
-{
-	return node_to_cpumask_map[node];
-}
-
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
-	cpumask_t mask = node_to_cpumask(node);
-
-	return first_cpu(mask);
-}
-
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
 
@@ -180,12 +175,44 @@ extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 #endif
 
-#else /* CONFIG_NUMA */
+#else /* !CONFIG_NUMA */
 
+#define numa_node_id()		0
+#define	cpu_to_node(cpu)	0
+#define	early_cpu_to_node(cpu)	0
+
+static inline cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	return &cpu_online_map;
+}
+static inline cpumask_t node_to_cpumask(int node)
+{
+	return cpu_online_map;
+}
+static inline int node_to_first_cpu(int node)
+{
+	return first_cpu(cpu_online_map);
+}
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node)		\
+		cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node)	\
+			   v = _node_to_cpumask_ptr(node)
 #endif
 
 #include <asm-generic/topology.h>
 
+#ifdef CONFIG_NUMA
+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+	node_to_cpumask_ptr(mask, node);
+	return first_cpu(*mask);
+}
+#endif
+
 extern cpumask_t cpu_coregroup_map(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
@@ -193,6 +220,9 @@ extern cpumask_t cpu_coregroup_map(int cpu);
 #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
 #define topology_core_siblings(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
+
+/* indicates that pointers to the topology cpumask_t maps are valid */
+#define arch_provides_topology_pointers		yes
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
@@ -220,4 +250,4 @@ static inline void set_mp_bus_to_node(int busnum, int node)
 }
 #endif
 
-#endif
+#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e4de460907c1..3d647b24041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1026,6 +1026,7 @@ extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
+extern int after_bootmem;
 
 #ifdef CONFIG_NUMA
 extern void setup_per_cpu_pageset(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..bcc22b569ee9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
author	Ingo Molnar <mingo@elte.hu>	2008-07-08 11:59:23 +0200
committer	Ingo Molnar <mingo@elte.hu>	2008-07-08 11:59:23 +0200
commit	2b4fa851b2f06fdb04cac808b57324f5e51e1578 (patch)
tree	97db3ad5adda7683923630982f68b8b52c86e790
parent	3de352bbd86f890dd0c5e1c09a6a1b0b29e0f8ce (diff)
parent	46f68e1c6b04a04772e828ff3bcd07ed708805c2 (diff)
download	blackbird-op-linux-2b4fa851b2f06fdb04cac808b57324f5e51e1578.tar.gz blackbird-op-linux-2b4fa851b2f06fdb04cac808b57324f5e51e1578.zip