Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r-- | mm/memory_hotplug.c | 553
1 files changed, 494 insertions, 59 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d04ed87bfacb..b81a367b9f39 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -29,6 +29,7 @@ #include <linux/suspend.h> #include <linux/mm_inline.h> #include <linux/firmware-map.h> +#include <linux/stop_machine.h> #include <asm/tlbflush.h> @@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { page->lru.next = (struct list_head *) type; SetPagePrivate(page); @@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page) mutex_lock(&ppb_lock); __free_pages_bootmem(page, 0); mutex_unlock(&ppb_lock); + totalram_pages++; } } +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +#ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; @@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void register_page_bootmem_info_node(struct pglist_data *pgdat) { @@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } pfn = pgdat->node_start_pfn; - end_pfn = pfn + pgdat->node_spanned_pages; + end_pfn = pgdat_end_pfn(pgdat); /* register_section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { @@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) register_page_bootmem_info_section(pfn); } } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) @@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } +/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or + * alloc_bootmem_node_nopanic() */ +static int __ref ensure_zone_is_initialized(struct zone *zone, + unsigned long start_pfn, unsigned long num_pages) +{ + if (!zone_is_initialized(zone)) + return init_currently_empty_zone(zone, start_pfn, num_pages, + MEMMAP_HOTPLUG); + return 0; +} + static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { @@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long flags; unsigned long z1_start_pfn; - if (!z1->wait_table) { - ret = init_currently_empty_zone(z1, start_pfn, - end_pfn - start_pfn, MEMMAP_HOTPLUG); - if (ret) - return ret; - } + ret 
= ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; pgdat_resize_lock(z1->zone_pgdat, &flags); /* can't move pfns which are higher than @z2 */ - if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) + if (end_pfn > zone_end_pfn(z2)) goto out_fail; /* the move out part mast at the left most of @z2 */ if (start_pfn > z2->zone_start_pfn) @@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, z1_start_pfn = start_pfn; resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); + resize_zone(z2, end_pfn, zone_end_pfn(z2)); pgdat_resize_unlock(z1->zone_pgdat, &flags); @@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long flags; unsigned long z2_end_pfn; - if (!z2->wait_table) { - ret = init_currently_empty_zone(z2, start_pfn, - end_pfn - start_pfn, MEMMAP_HOTPLUG); - if (ret) - return ret; - } + ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, if (z1->zone_start_pfn > start_pfn) goto out_fail; /* the move out part mast at the right most of @z1 */ - if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) + if (zone_end_pfn(z1) > end_pfn) goto out_fail; /* must included/overlap */ - if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) + if (start_pfn >= zone_end_pfn(z1)) goto out_fail; /* use end_pfn for z2's end_pfn if z2 is empty */ if (z2->spanned_pages) - z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; + z2_end_pfn = zone_end_pfn(z2); else z2_end_pfn = end_pfn; @@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nid = pgdat->node_id; int zone_type; unsigned long flags; + int ret; zone_type = zone - pgdat->node_zones; - if (!zone->wait_table) { - int ret; + ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); + if (ret) + return ret; - ret = init_currently_empty_zone(zone, phys_start_pfn, - nr_pages, MEMMAP_HOTPLUG); - if (ret) - return ret; - } pgdat_resize_lock(zone->zone_pgdat, &flags); grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, @@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone, return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static int __remove_section(struct zone *zone, struct mem_section *ms) +/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ +static int find_smallest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(start_pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(start_pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(start_pfn))) + continue; + + return start_pfn; + } + + return 0; +} + +/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ +static int find_biggest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + unsigned long pfn; + + /* pfn is the end pfn of a memory section. 
*/ + pfn = end_pfn - 1; + for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(pfn))) + continue; + + return pfn; + } + + return 0; +} + +static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + struct mem_section *ms; + int nid = zone_to_nid(zone); + + zone_span_writelock(zone); + if (zone_start_pfn == start_pfn) { + /* + * If the section is smallest section in the zone, it need + * shrink zone->zone_start_pfn and zone->zone_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, zone, end_pfn, + zone_end_pfn); + if (pfn) { + zone->zone_start_pfn = pfn; + zone->spanned_pages = zone_end_pfn - pfn; + } + } else if (zone_end_pfn == end_pfn) { + /* + * If the section is biggest section in the zone, it need + * shrink zone->spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. + */ + pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + start_pfn); + if (pfn) + zone->spanned_pages = pfn - zone_start_pfn + 1; + } + /* - * XXX: Freeing memmap with vmemmap is not implement yet. - * This should be removed later. + * The section is not biggest or smallest mem_section in the zone, it + * only creates a hole in the zone. So in this case, we need not + * change the zone. But perhaps, the zone has only hole data. Thus + * it check the zone has only hole or not. */ - return -EBUSY; + pfn = zone_start_pfn; + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (page_zone(pfn_to_page(pfn)) != zone) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + zone_span_writeunlock(zone); + return; + } + + /* The zone has no valid section */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + zone_span_writeunlock(zone); } -#else -static int __remove_section(struct zone *zone, struct mem_section *ms) + +static void shrink_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + struct mem_section *ms; + int nid = pgdat->node_id; + + if (pgdat_start_pfn == start_pfn) { + /* + * If the section is smallest section in the pgdat, it need + * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, NULL, end_pfn, + pgdat_end_pfn); + if (pfn) { + pgdat->node_start_pfn = pfn; + pgdat->node_spanned_pages = pgdat_end_pfn - pfn; + } + } else if (pgdat_end_pfn == end_pfn) { + /* + * If the section is biggest section in the pgdat, it need + * shrink pgdat->node_spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. 
+ */ + pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, + start_pfn); + if (pfn) + pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; + } + + /* + * If the section is not biggest or smallest mem_section in the pgdat, + * it only creates a hole in the pgdat. So in this case, we need not + * change the pgdat. + * But perhaps, the pgdat has only hole data. Thus it check the pgdat + * has only hole or not. + */ + pfn = pgdat_start_pfn; + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + return; + } + + /* The pgdat has no valid section */ + pgdat->node_start_pfn = 0; + pgdat->node_spanned_pages = 0; +} + +static void __remove_zone(struct zone *zone, unsigned long start_pfn) { - unsigned long flags; struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int zone_type; + unsigned long flags; + + zone_type = zone - pgdat->node_zones; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); + shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long start_pfn; + int scn_nr; int ret = -EINVAL; if (!valid_section(ms)) @@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) if (ret) return ret; - pgdat_resize_lock(pgdat, &flags); + scn_nr = __section_nr(ms); + start_pfn = section_nr_to_pfn(scn_nr); + __remove_zone(zone, start_pfn); + sparse_remove_one_section(zone, ms); - pgdat_resize_unlock(pgdat, &flags); return 0; } -#endif /* * Reasonably generic function for adding memory. It is @@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = start >> PAGE_SHIFT; - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; + pgdat = NODE_DATA(nid); + if (!pgdat) { + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; - arch_refresh_nodedata(nid, pgdat); + arch_refresh_nodedata(nid, pgdat); + } /* we can use NODE_DATA(nid) from here */ @@ -854,7 +1080,8 @@ out: int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; - int new_pgdat = 0; + bool new_pgdat; + bool new_node; struct resource *res; int ret; @@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size) if (!res) goto out; - if (!node_online(nid)) { + { /* Stupid hack to suppress address-never-null warning */ + void *p = NODE_DATA(nid); + new_pgdat = !p; + } + new_node = !node_online(nid); + if (new_node) { pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) goto error; - new_pgdat = 1; } /* call arch's memory hotadd */ @@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. 
*/ node_set_online(nid); - if (new_pgdat) { + if (new_node) { ret = register_one_node(nid); /* * If sysfs file of new node can't create, cpu on the node @@ -901,8 +1132,7 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - if (res) - release_memory_resource(res); + release_memory_resource(res); out: unlock_memory_hotplug(); @@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. */ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC, - MR_MEMORY_HOTPLUG); + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } @@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); } -int remove_memory(u64 start, u64 size) +/** + * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) + * @start_pfn: start pfn of the memory range + * @end_pfn: end pft of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present mem sections in range + * [start_pfn, end_pfn) and call func on each mem section. + * + * Returns the return value of func. + */ +static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, + void *arg, int (*func)(struct memory_block *, void *)) { struct memory_block *mem = NULL; struct mem_section *section; - unsigned long start_pfn, end_pfn; unsigned long pfn, section_nr; int ret; - start_pfn = PFN_DOWN(start); - end_pfn = start_pfn + PFN_DOWN(size); - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { section_nr = pfn_to_section_nr(pfn); if (!present_section_nr(section_nr)) @@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size) if (!mem) continue; - ret = offline_memory_block(mem); + ret = func(mem, arg); if (ret) { kobject_put(&mem->dev.kobj); return ret; @@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size) return 0; } + +/** + * offline_memory_block_cb - callback function for offlining memory block + * @mem: the memory block to be offlined + * @arg: buffer to hold error msg + * + * Always return 0, and put the error msg in arg if any. + */ +static int offline_memory_block_cb(struct memory_block *mem, void *arg) +{ + int *ret = arg; + int error = offline_memory_block(mem); + + if (error != 0 && *ret == 0) + *ret = error; + + return 0; +} + +static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) +{ + int ret = !is_memblock_offlined(mem); + + if (unlikely(ret)) + pr_warn("removing memory fails, because memory " + "[%#010llx-%#010llx] is onlined\n", + PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), + PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); + + return ret; +} + +static int check_cpu_on_node(void *data) +{ + struct pglist_data *pgdat = data; + int cpu; + + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == pgdat->node_id) + /* + * the cpu on this node isn't removed, and we can't + * offline this node. 
+ */ + return -EBUSY; + } + + return 0; +} + +static void unmap_cpu_on_node(void *data) +{ +#ifdef CONFIG_ACPI_NUMA + struct pglist_data *pgdat = data; + int cpu; + + for_each_possible_cpu(cpu) + if (cpu_to_node(cpu) == pgdat->node_id) + numa_clear_node(cpu); +#endif +} + +static int check_and_unmap_cpu_on_node(void *data) +{ + int ret = check_cpu_on_node(data); + + if (ret) + return ret; + + /* + * the node will be offlined when we come here, so we can clear + * the cpu_to_node() now. + */ + + unmap_cpu_on_node(data); + return 0; +} + +/* offline the node if all memory sections of this node are removed */ +void try_offline_node(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + struct page *pgdat_page = virt_to_page(pgdat); + int i; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!present_section_nr(section_nr)) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* + * some memory sections of this node are not removed, and we + * can't offline node now. + */ + return; + } + + if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) + return; + + /* + * all memory/cpu of this node are removed, we can offline this + * node now. + */ + node_set_offline(nid); + unregister_one_node(nid); + + if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) + /* node data is allocated from boot memory */ + return; + + /* free waittable in each zone */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (zone->wait_table) + vfree(zone->wait_table); + } + + /* + * Since there is no way to guarentee the address of pgdat/zone is not + * on stack of any kernel threads or used by other kernel objects + * without reference counting or other symchronizing method, do not + * reset node_data and free pgdat here. Just reset it to 0 and reuse + * the memory when the node is online again. + */ + memset(pgdat, 0, sizeof(*pgdat)); +} +EXPORT_SYMBOL(try_offline_node); + +int __ref remove_memory(int nid, u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + int ret = 0; + int retry = 1; + + start_pfn = PFN_DOWN(start); + end_pfn = start_pfn + PFN_DOWN(size); + + /* + * When CONFIG_MEMCG is on, one memory block may be used by other + * blocks to store page cgroup when onlining pages. But we don't know + * in what order pages are onlined. So we iterate twice to offline + * memory: + * 1st iterate: offline every non primary memory block. + * 2nd iterate: offline primary (i.e. first added) memory block. + */ +repeat: + walk_memory_range(start_pfn, end_pfn, &ret, + offline_memory_block_cb); + if (ret) { + if (!retry) + return ret; + + retry = 0; + ret = 0; + goto repeat; + } + + lock_memory_hotplug(); + + /* + * we have offlined all memory blocks like this: + * 1. lock memory hotplug + * 2. offline a memory block + * 3. unlock memory hotplug + * + * repeat step1-3 to offline the memory block. All memory blocks + * must be offlined before removing memory. But we don't hold the + * lock in the whole operation. So we should check whether all + * memory blocks are offlined. 
+	 */
+
+	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+				is_memblock_offlined_cb);
+	if (ret) {
+		unlock_memory_hotplug();
+		return ret;
+	}
+
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
+	arch_remove_memory(start, size);
+
+	try_offline_node(nid);
+
+	unlock_memory_hotplug();
+
+	return 0;
+}
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
 	return -EINVAL;
 }
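
With this change remove_memory() takes the node id as its first argument, so a hot-remove caller has to supply it. A minimal caller-side sketch, not part of this diff: the helper name is hypothetical, and deriving the node with memory_add_physaddr_to_nid() is an assumption about the caller, not something the patch mandates.

	#include <linux/memory_hotplug.h>

	/*
	 * Illustrative sketch only: a hot-remove path calling the new
	 * remove_memory(nid, start, size).  The function name is made up;
	 * using memory_add_physaddr_to_nid() to look up the node is an
	 * assumption, not required by this patch.
	 */
	static int example_hot_remove(u64 start, u64 size)
	{
		int nid = memory_add_physaddr_to_nid(start);

		return remove_memory(nid, start, size);
	}

Passing nid explicitly is what lets remove_memory() hand the node to try_offline_node() after arch_remove_memory() has torn the range down, instead of re-deriving it from pfns that are no longer backed by valid sections.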