summaryrefslogtreecommitdiffstats
path: root/mm/memory_hotplug.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--mm/memory_hotplug.c475
1 files changed, 219 insertions, 256 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c73f09913165..0a54ffac8c68 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -49,8 +49,6 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page, unsigned int order);
-
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -278,6 +276,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
return 0;
}
+static int check_hotplug_memory_addressable(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+ if (max_addr >> MAX_PHYSMEM_BITS) {
+ const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+ WARN(1,
+ "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+ (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -291,6 +305,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
+ err = check_hotplug_memory_addressable(pfn, nr_pages);
+ if (err)
+ return err;
+
if (altmap) {
/*
* Validate altmap is within bounds of the total request
@@ -331,13 +349,13 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long end_pfn)
{
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(start_pfn)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -356,13 +374,13 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -374,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -389,144 +404,106 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- int nid = pgdat->node_id;
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
- continue;
-
- if (pfn_to_nid(pfn) != nid)
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages)
+void __ref remove_pfn_range_from_zone(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+ /* Poison struct pages because they are now uninitialized again. */
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
+ clear_zone_contiguous(zone);
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ set_zone_contiguous(zone);
}
-static void __remove_section(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- __remove_zone(zone, pfn, nr_pages);
sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
- * __remove_pages() - remove sections of pages from a zone
- * @zone: zone from which pages need to be removed
+ * __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
@@ -536,34 +513,25 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-void __remove_pages(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
- clear_zone_contiguous(zone);
-
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(zone, pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
-
- set_zone_contiguous(zone);
}
int set_online_page_callback(online_page_callback_t callback)
@@ -604,24 +572,7 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
-void __online_page_set_limits(struct page *page)
-{
-}
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
-
-void __online_page_increment_counters(struct page *page)
-{
- adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-
-void __online_page_free(struct page *page)
-{
- __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
-
-static void generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
@@ -631,34 +582,32 @@ static void generic_online_page(struct page *page, unsigned int order)
totalhigh_pages_add(1UL << order);
#endif
}
-
-static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
-{
- unsigned long end = start + nr_pages;
- int order, onlined_pages = 0;
-
- while (start < end) {
- order = min(MAX_ORDER - 1,
- get_order(PFN_PHYS(end) - PFN_PHYS(start)));
- (*online_page_callback)(pfn_to_page(start), order);
-
- onlined_pages += (1UL << order);
- start += (1UL << order);
- }
- return onlined_pages;
-}
+EXPORT_SYMBOL_GPL(generic_online_page);
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long onlined_pages = *(unsigned long *)arg;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ int order;
- if (PageReserved(pfn_to_page(start_pfn)))
- onlined_pages += online_pages_blocks(start_pfn, nr_pages);
+ /*
+ * Online the pages. The callback might decide to keep some pages
+ * PG_reserved (to add them to the buddy later), but we still account
+ * them as being online/belonging to this zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+ order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+ /* __free_pages_core() wants pfns to be aligned to the order */
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+ order = 0;
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ }
- online_mem_sections(start_pfn, start_pfn + nr_pages);
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += nr_pages;
return 0;
}
@@ -714,8 +663,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
@@ -804,43 +758,21 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- struct zone *zone;
-
- zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
- return zone;
-}
-
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ int online_type, int nid)
{
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
- int nid;
int ret;
struct memory_notify arg;
- struct memory_block *mem;
mem_hotplug_begin();
- /*
- * We can't use pfn_to_nid() because nid might be stored in struct page
- * which is not yet initialized. Instead, we find nid from memory block.
- */
- mem = find_memory_block(__pfn_to_section(pfn));
- nid = mem->nid;
- put_device(&mem->dev);
-
/* associate pfn range with the zone */
- zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -864,6 +796,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ /* not a single memory resource was applicable */
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
goto failed_addition;
@@ -877,27 +810,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
shuffle_zone(zone);
- if (onlined_pages) {
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
- }
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ else
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
- if (onlined_pages) {
- kswapd_run(nid);
- kcompactd_run(nid);
- }
+ kswapd_run(nid);
+ kcompactd_run(nid);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
- if (onlined_pages)
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &arg);
mem_hotplug_done();
return 0;
@@ -906,6 +834,7 @@ failed_addition:
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ remove_pfn_range_from_zone(zone, pfn, nr_pages);
mem_hotplug_done();
return ret;
}
@@ -933,8 +862,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
if (!pgdat)
return NULL;
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
+ int cpu;
/*
* Reset the nr_zones, order and classzone_idx before reuse.
* Note that kswapd will init kswapd_classzone_idx properly
@@ -943,6 +875,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
}
/* we can use NODE_DATA(nid) from here */
@@ -952,7 +890,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1211,7 +1148,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+ return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
+ MEMORY_OFFLINE);
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -1234,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
{
unsigned long pfn, sec_end_pfn;
- unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
@@ -1262,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
- return 0;
+ return NULL;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
- return 0;
- if (!zone)
- start = pfn + i;
+ return NULL;
zone = page_zone(page);
- end = pfn + MAX_ORDER_NR_PAGES;
}
}
- if (zone) {
- *valid_start = start;
- *valid_end = min(end, end_pfn);
- return 1;
- } else {
- return 0;
- }
+ return zone;
}
/*
@@ -1309,7 +1237,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
head = compound_head(page);
if (page_huge_active(head))
return pfn;
- skip = (1 << compound_order(head)) - (page - head);
+ skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
return 0;
@@ -1347,7 +1275,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
@@ -1408,9 +1336,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/*
- * remove from free_area[] and mark all as Reserved.
- */
+/* Mark all sections offline and remove all free pages from the buddy. */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
@@ -1428,7 +1354,8 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages,
+ MEMORY_OFFLINE);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1509,37 +1436,58 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
node_clear_state(node, N_MEMORY);
}
+static int count_system_ram_pages_cb(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long *nr_system_ram_pages = data;
+
+ *nr_system_ram_pages += nr_pages;
+ return 0;
+}
+
static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, nr_pages;
+ unsigned long pfn, nr_pages = 0;
unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
- unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
char *reason;
mem_hotplug_begin();
+ /*
+ * Don't allow to offline memory blocks that contain holes.
+ * Consequently, memory blocks with holes can never get onlined
+ * via the hotplug path - online_pages() - as hotplugged memory has
+ * no holes. This way, we e.g., don't have to worry about marking
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
+ * avoid using walk_system_ram_range() later.
+ */
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ count_system_ram_pages_cb);
+ if (nr_pages != end_pfn - start_pfn) {
+ ret = -EINVAL;
+ reason = "memory holes";
+ goto failed_removal;
+ }
+
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
- &valid_end)) {
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
-
- zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
- nr_pages = end_pfn - start_pfn;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- SKIP_HWPOISON | REPORT_FAILURE);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret < 0) {
reason = "failure to isolate range";
goto failed_removal;
@@ -1633,6 +1581,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
mem_hotplug_done();
return 0;
@@ -1662,7 +1611,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
@@ -1687,6 +1636,18 @@ static int check_cpu_on_node(pg_data_t *pgdat)
return 0;
}
+static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
+{
+ int nid = *(int *)arg;
+
+ /*
+ * If a memory block belongs to multiple nodes, the stored nid is not
+ * reliable. However, such blocks are always online (e.g., cannot get
+ * offlined) and, therefore, are still spanned by the node.
+ */
+ return mem->nid == nid ? -EEXIST : 0;
+}
+
/**
* try_offline_node
* @nid: the node ID
@@ -1699,25 +1660,24 @@ static int check_cpu_on_node(pg_data_t *pgdat)
void try_offline_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = pgdat->node_start_pfn;
- unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
-
- if (!present_section_nr(section_nr))
- continue;
+ int rc;
- if (pfn_to_nid(pfn) != nid)
- continue;
+ /*
+ * If the node still spans pages (especially ZONE_DEVICE), don't
+ * offline it. A node spans memory after move_pfn_range_to_zone(),
+ * e.g., after the memory block was onlined.
+ */
+ if (pgdat->node_spanned_pages)
+ return;
- /*
- * some memory sections of this node are not removed, and we
- * can't offline node now.
- */
+ /*
+ * Especially offline memory blocks might not be spanned by the
+ * node. They will get spanned by the node once they get onlined.
+ * However, they link to the node in sysfs and can get onlined later.
+ */
+ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
+ if (rc)
return;
- }
if (check_cpu_on_node(pgdat))
return;
@@ -1757,8 +1717,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
- mem_hotplug_begin();
-
/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1770,13 +1728,18 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- memblock_free(start, size);
- memblock_remove(start, size);
- /* remove memory block devices before removing memory */
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ */
remove_memory_block_devices(start, size);
+ mem_hotplug_begin();
+
arch_remove_memory(nid, start, size, NULL);
+ memblock_free(start, size);
+ memblock_remove(start, size);
__release_memory_resource(start, size);
try_offline_node(nid);
@@ -1800,7 +1763,7 @@ void __remove_memory(int nid, u64 start, u64 size)
{
/*
- * trigger BUG() is some memory is not offlined prior to calling this
+ * trigger BUG() if some memory is not offlined prior to calling this
* function
*/
if (try_remove_memory(nid, start, size))
OpenPOWER on IntegriCloud