Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c   466
1 file changed, 270 insertions, 196 deletions
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..d4096f4a5c1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@  #include <linux/memblock.h>  #include <linux/compiler.h>  #include <linux/kernel.h> -#include <linux/kmemcheck.h>  #include <linux/kasan.h>  #include <linux/module.h>  #include <linux/suspend.h> @@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);  EXPORT_PER_CPU_SYMBOL(numa_node);  #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); +  #ifdef CONFIG_HAVE_MEMORYLESS_NODES  /*   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);  int page_group_by_mobility_disabled __read_mostly;  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */  static inline void reset_deferred_meminit(pg_data_t *pgdat)  { -	unsigned long max_initialise; -	unsigned long reserved_lowmem; +	phys_addr_t start_addr, end_addr; +	unsigned long max_pgcnt; +	unsigned long reserved;  	/*  	 * Initialise at least 2G of a node but also take into account that  	 * two large system hashes that can take up 1GB for 0.25TB/node.  	 */ -	max_initialise = max(2UL << (30 - PAGE_SHIFT), -		(pgdat->node_spanned_pages >> 8)); +	max_pgcnt = max(2UL << (30 - PAGE_SHIFT), +			(pgdat->node_spanned_pages >> 8));  	/*  	 * Compensate the all the memblock reservations (e.g. crash kernel)  	 * from the initial estimation to make sure we will initialize enough  	 * memory to boot.  	 */ -	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, -			pgdat->node_start_pfn + max_initialise); -	max_initialise += reserved_lowmem; +	start_addr = PFN_PHYS(pgdat->node_start_pfn); +	end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); +	reserved = memblock_reserved_memory_within(start_addr, end_addr); +	max_pgcnt += PHYS_PFN(reserved); -	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); +	pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);  	pgdat->first_deferred_pfn = ULONG_MAX;  } @@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,  	if (zone_end < pgdat_end_pfn(pgdat))  		return true;  	(*nr_initialised)++; -	if ((*nr_initialised > pgdat->static_init_size) && +	if ((*nr_initialised > pgdat->static_init_pgcnt) &&  	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {  		pgdat->first_deferred_pfn = pfn;  		return false; @@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,  	VM_BUG_ON_PAGE(PageTail(page), page);  	trace_mm_page_free(page, order); -	kmemcheck_free_shadow(page, order);  	/*  	 * Check tail pages before head page information is cleared to @@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,  static void __meminit __init_single_page(struct page *page, unsigned long pfn,  				unsigned long zone, int nid)  { +	mm_zero_struct_page(page);  	set_page_links(page, zone, nid, pfn);  	init_page_count(page);  	page_mapcount_reset(page); @@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)  }  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(struct page *page, -					unsigned long pfn, int nr_pages) +static void __init deferred_free_range(unsigned long pfn, +				       unsigned long nr_pages)  { -	int i; +	struct page *page; +	unsigned long i; -	
if (!page) +	if (!nr_pages)  		return; +	page = pfn_to_page(pfn); +  	/* Free a large naturally-aligned chunk if possible */  	if (nr_pages == pageblock_nr_pages &&  	    (pfn & (pageblock_nr_pages - 1)) == 0) { @@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)  		complete(&pgdat_init_all_done_comp);  } +/* + * Helper for deferred_init_range, free the given range, reset the counters, and + * return number of pages freed. + */ +static inline unsigned long __init __def_free(unsigned long *nr_free, +					      unsigned long *free_base_pfn, +					      struct page **page) +{ +	unsigned long nr = *nr_free; + +	deferred_free_range(*free_base_pfn, nr); +	*free_base_pfn = 0; +	*nr_free = 0; +	*page = NULL; + +	return nr; +} + +static unsigned long __init deferred_init_range(int nid, int zid, +						unsigned long start_pfn, +						unsigned long end_pfn) +{ +	struct mminit_pfnnid_cache nid_init_state = { }; +	unsigned long nr_pgmask = pageblock_nr_pages - 1; +	unsigned long free_base_pfn = 0; +	unsigned long nr_pages = 0; +	unsigned long nr_free = 0; +	struct page *page = NULL; +	unsigned long pfn; + +	/* +	 * First we check if pfn is valid on architectures where it is possible +	 * to have holes within pageblock_nr_pages. On systems where it is not +	 * possible, this function is optimized out. +	 * +	 * Then, we check if a current large page is valid by only checking the +	 * validity of the head pfn. +	 * +	 * meminit_pfn_in_nid is checked on systems where pfns can interleave +	 * within a node: a pfn is between start and end of a node, but does not +	 * belong to this memory node. +	 * +	 * Finally, we minimize pfn page lookups and scheduler checks by +	 * performing it only once every pageblock_nr_pages. +	 * +	 * We do it in two loops: first we initialize struct page, than free to +	 * buddy allocator, becuse while we are freeing pages we can access +	 * pages that are ahead (computing buddy page in __free_one_page()). 
+	 */ +	for (pfn = start_pfn; pfn < end_pfn; pfn++) { +		if (!pfn_valid_within(pfn)) +			continue; +		if ((pfn & nr_pgmask) || pfn_valid(pfn)) { +			if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { +				if (page && (pfn & nr_pgmask)) +					page++; +				else +					page = pfn_to_page(pfn); +				__init_single_page(page, pfn, zid, nid); +				cond_resched(); +			} +		} +	} + +	page = NULL; +	for (pfn = start_pfn; pfn < end_pfn; pfn++) { +		if (!pfn_valid_within(pfn)) { +			nr_pages += __def_free(&nr_free, &free_base_pfn, &page); +		} else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { +			nr_pages += __def_free(&nr_free, &free_base_pfn, &page); +		} else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { +			nr_pages += __def_free(&nr_free, &free_base_pfn, &page); +		} else if (page && (pfn & nr_pgmask)) { +			page++; +			nr_free++; +		} else { +			nr_pages += __def_free(&nr_free, &free_base_pfn, &page); +			page = pfn_to_page(pfn); +			free_base_pfn = pfn; +			nr_free = 1; +			cond_resched(); +		} +	} +	/* Free the last block of pages to allocator */ +	nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + +	return nr_pages; +} +  /* Initialise remaining memory on a node */  static int __init deferred_init_memmap(void *data)  {  	pg_data_t *pgdat = data;  	int nid = pgdat->node_id; -	struct mminit_pfnnid_cache nid_init_state = { };  	unsigned long start = jiffies;  	unsigned long nr_pages = 0; -	unsigned long walk_start, walk_end; -	int i, zid; +	unsigned long spfn, epfn; +	phys_addr_t spa, epa; +	int zid;  	struct zone *zone;  	unsigned long first_init_pfn = pgdat->first_deferred_pfn;  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); +	u64 i;  	if (first_init_pfn == ULONG_MAX) {  		pgdat_init_report_one_done(); @@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)  		if (first_init_pfn < zone_end_pfn(zone))  			break;  	} +	first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); -	for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { -		unsigned long pfn, end_pfn; -		struct page *page = NULL; -		struct page *free_base_page = NULL; -		unsigned long free_base_pfn = 0; -		int nr_to_free = 0; - -		end_pfn = min(walk_end, zone_end_pfn(zone)); -		pfn = first_init_pfn; -		if (pfn < walk_start) -			pfn = walk_start; -		if (pfn < zone->zone_start_pfn) -			pfn = zone->zone_start_pfn; - -		for (; pfn < end_pfn; pfn++) { -			if (!pfn_valid_within(pfn)) -				goto free_range; - -			/* -			 * Ensure pfn_valid is checked every -			 * pageblock_nr_pages for memory holes -			 */ -			if ((pfn & (pageblock_nr_pages - 1)) == 0) { -				if (!pfn_valid(pfn)) { -					page = NULL; -					goto free_range; -				} -			} - -			if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { -				page = NULL; -				goto free_range; -			} - -			/* Minimise pfn page lookups and scheduler checks */ -			if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { -				page++; -			} else { -				nr_pages += nr_to_free; -				deferred_free_range(free_base_page, -						free_base_pfn, nr_to_free); -				free_base_page = NULL; -				free_base_pfn = nr_to_free = 0; - -				page = pfn_to_page(pfn); -				cond_resched(); -			} - -			if (page->flags) { -				VM_BUG_ON(page_zone(page) != zone); -				goto free_range; -			} - -			__init_single_page(page, pfn, zid, nid); -			if (!free_base_page) { -				free_base_page = page; -				free_base_pfn = pfn; -				nr_to_free = 0; -			} -			nr_to_free++; - -			/* Where possible, batch up pages for a single free */ -			continue; -free_range: -			/* Free the current 
block of pages to allocator */ -			nr_pages += nr_to_free; -			deferred_free_range(free_base_page, free_base_pfn, -								nr_to_free); -			free_base_page = NULL; -			free_base_pfn = nr_to_free = 0; -		} -		/* Free the last block of pages to allocator */ -		nr_pages += nr_to_free; -		deferred_free_range(free_base_page, free_base_pfn, nr_to_free); - -		first_init_pfn = max(end_pfn, first_init_pfn); +	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { +		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); +		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); +		nr_pages += deferred_init_range(nid, zid, spfn, epfn);  	}  	/* Sanity check that the next zone really is unpopulated */ @@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags   * Go through the free lists for the given migratetype and remove   * the smallest available page from the freelists   */ -static inline +static __always_inline  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  						int migratetype)  { @@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {  };  #ifdef CONFIG_CMA -static struct page *__rmqueue_cma_fallback(struct zone *zone, +static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,  					unsigned int order)  {  	return __rmqueue_smallest(zone, order, MIGRATE_CMA); @@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,   * deviation from the rest of this file, to make the for loop   * condition simpler.   */ -static inline bool +static __always_inline bool  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)  {  	struct free_area *area; @@ -2289,8 +2321,8 @@ do_steal:   * Do the hard work of removing an element from the buddy allocator.   * Call me with the zone->lock already held.   */ -static struct page *__rmqueue(struct zone *zone, unsigned int order, -				int migratetype) +static __always_inline struct page * +__rmqueue(struct zone *zone, unsigned int order, int migratetype)  {  	struct page *page; @@ -2315,7 +2347,7 @@ retry:   */  static int rmqueue_bulk(struct zone *zone, unsigned int order,  			unsigned long count, struct list_head *list, -			int migratetype, bool cold) +			int migratetype)  {  	int i, alloced = 0; @@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,  			continue;  		/* -		 * Split buddy pages returned by expand() are received here -		 * in physical page order. The page is added to the callers and -		 * list and the list head then moves forward. From the callers -		 * perspective, the linked list is ordered by page number in -		 * some conditions. This is useful for IO devices that can -		 * merge IO requests if the physical pages are ordered -		 * properly. +		 * Split buddy pages returned by expand() are received here in +		 * physical page order. The page is added to the tail of +		 * caller's list. From the callers perspective, the linked list +		 * is ordered by page number under some conditions. This is +		 * useful for IO devices that can forward direction from the +		 * head, thus also in the physical page order. This is useful +		 * for IO devices that can merge IO requests if the physical +		 * pages are ordered properly.  		 
*/ -		if (likely(!cold)) -			list_add(&page->lru, list); -		else -			list_add_tail(&page->lru, list); -		list = &page->lru; +		list_add_tail(&page->lru, list);  		alloced++;  		if (is_migrate_cma(get_pcppage_migratetype(page)))  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -2590,24 +2619,25 @@ void mark_free_pages(struct zone *zone)  }  #endif /* CONFIG_PM */ -/* - * Free a 0-order page - * cold == true ? free a cold page : free a hot page - */ -void free_hot_cold_page(struct page *page, bool cold) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn)  { -	struct zone *zone = page_zone(page); -	struct per_cpu_pages *pcp; -	unsigned long flags; -	unsigned long pfn = page_to_pfn(page);  	int migratetype;  	if (!free_pcp_prepare(page)) -		return; +		return false;  	migratetype = get_pfnblock_migratetype(page, pfn);  	set_pcppage_migratetype(page, migratetype); -	local_irq_save(flags); +	return true; +} + +static void free_unref_page_commit(struct page *page, unsigned long pfn) +{ +	struct zone *zone = page_zone(page); +	struct per_cpu_pages *pcp; +	int migratetype; + +	migratetype = get_pcppage_migratetype(page);  	__count_vm_event(PGFREE);  	/* @@ -2620,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold)  	if (migratetype >= MIGRATE_PCPTYPES) {  		if (unlikely(is_migrate_isolate(migratetype))) {  			free_one_page(zone, page, pfn, 0, migratetype); -			goto out; +			return;  		}  		migratetype = MIGRATE_MOVABLE;  	}  	pcp = &this_cpu_ptr(zone->pageset)->pcp; -	if (!cold) -		list_add(&page->lru, &pcp->lists[migratetype]); -	else -		list_add_tail(&page->lru, &pcp->lists[migratetype]); +	list_add(&page->lru, &pcp->lists[migratetype]);  	pcp->count++;  	if (pcp->count >= pcp->high) {  		unsigned long batch = READ_ONCE(pcp->batch);  		free_pcppages_bulk(zone, batch, pcp);  		pcp->count -= batch;  	} +} -out: +/* + * Free a 0-order page + */ +void free_unref_page(struct page *page) +{ +	unsigned long flags; +	unsigned long pfn = page_to_pfn(page); + +	if (!free_unref_page_prepare(page, pfn)) +		return; + +	local_irq_save(flags); +	free_unref_page_commit(page, pfn);  	local_irq_restore(flags);  }  /*   * Free a list of 0-order pages   */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list)  {  	struct page *page, *next; +	unsigned long flags, pfn; + +	/* Prepare pages for freeing */ +	list_for_each_entry_safe(page, next, list, lru) { +		pfn = page_to_pfn(page); +		if (!free_unref_page_prepare(page, pfn)) +			list_del(&page->lru); +		set_page_private(page, pfn); +	} +	local_irq_save(flags);  	list_for_each_entry_safe(page, next, list, lru) { -		trace_mm_page_free_batched(page, cold); -		free_hot_cold_page(page, cold); +		unsigned long pfn = page_private(page); + +		set_page_private(page, 0); +		trace_mm_page_free_batched(page); +		free_unref_page_commit(page, pfn);  	} +	local_irq_restore(flags);  }  /* @@ -2669,15 +2723,6 @@ void split_page(struct page *page, unsigned int order)  	VM_BUG_ON_PAGE(PageCompound(page), page);  	VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK -	/* -	 * Split shadow pages too, because free(page[0]) would -	 * otherwise free the whole shadow. 
-	 */ -	if (kmemcheck_page_is_tracked(page)) -		split_page(virt_to_page(page[0].shadow), order); -#endif -  	for (i = 1; i < (1 << order); i++)  		set_page_refcounted(page + i);  	split_page_owner(page, order); @@ -2743,6 +2788,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)  #ifdef CONFIG_NUMA  	enum numa_stat_item local_stat = NUMA_LOCAL; +	/* skip numa counters update if numa stats is disabled */ +	if (!static_branch_likely(&vm_numa_stat_key)) +		return; +  	if (z->node != numa_node_id())  		local_stat = NUMA_OTHER; @@ -2758,7 +2807,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)  /* Remove page from the per-cpu list, caller must protect the list */  static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, -			bool cold, struct per_cpu_pages *pcp, +			struct per_cpu_pages *pcp,  			struct list_head *list)  {  	struct page *page; @@ -2767,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,  		if (list_empty(list)) {  			pcp->count += rmqueue_bulk(zone, 0,  					pcp->batch, list, -					migratetype, cold); +					migratetype);  			if (unlikely(list_empty(list)))  				return NULL;  		} -		if (cold) -			page = list_last_entry(list, struct page, lru); -		else -			page = list_first_entry(list, struct page, lru); - +		page = list_first_entry(list, struct page, lru);  		list_del(&page->lru);  		pcp->count--;  	} while (check_new_pcp(page)); @@ -2791,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,  {  	struct per_cpu_pages *pcp;  	struct list_head *list; -	bool cold = ((gfp_flags & __GFP_COLD) != 0);  	struct page *page;  	unsigned long flags;  	local_irq_save(flags);  	pcp = &this_cpu_ptr(zone->pageset)->pcp;  	list = &pcp->lists[migratetype]; -	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list); +	page = __rmqueue_pcplist(zone,  migratetype, pcp, list);  	if (page) {  		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);  		zone_statistics(preferred_zone, zone); @@ -3006,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,  		if (!area->nr_free)  			continue; -		if (alloc_harder) -			return true; -  		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {  			if (!list_empty(&area->free_list[mt]))  				return true; @@ -3020,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,  			return true;  		}  #endif +		if (alloc_harder && +			!list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) +			return true;  	}  	return false;  } @@ -3235,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)  	
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))  		return; -	pr_warn("%s: ", current->comm); -  	va_start(args, fmt);  	vaf.fmt = fmt;  	vaf.va = &args; -	pr_cont("%pV", &vaf); +	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", +			current->comm, &vaf, gfp_mask, &gfp_mask, +			nodemask_pr_args(nodemask));  	va_end(args); -	pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); -	if (nodemask) -		pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); -	else -		pr_cont("(null)\n"); -  	cpuset_print_current_mems_allowed();  	dump_stack(); @@ -3868,8 +3906,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  	enum compact_result compact_result;  	int compaction_retries;  	int no_progress_loops; -	unsigned long alloc_start = jiffies; -	unsigned int stall_timeout = 10 * HZ;  	unsigned int cpuset_mems_cookie;  	int reserve_flags; @@ -4001,14 +4037,6 @@ retry:  	if (!can_direct_reclaim)  		goto nopage; -	/* Make sure we know about allocations which stall for too long */ -	if (time_after(jiffies, alloc_start + stall_timeout)) { -		warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, -			"page allocation stalls for %ums, order:%u", -			jiffies_to_msecs(jiffies-alloc_start), order); -		stall_timeout += 10 * HZ; -	} -  	/* Avoid recursion of direct reclaim */  	if (current->flags & PF_MEMALLOC)  		goto nopage; @@ -4223,9 +4251,6 @@ out:  		page = NULL;  	} -	if (kmemcheck_enabled && page) -		kmemcheck_pagealloc_alloc(page, order, gfp_mask); -  	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);  	return page; @@ -4262,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order)  {  	if (put_page_testzero(page)) {  		if (order == 0) -			free_hot_cold_page(page, false); +			free_unref_page(page);  		else  			__free_pages_ok(page, order);  	} @@ -4320,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)  		unsigned int order = compound_order(page);  		if (order == 0) -			free_hot_cold_page(page, false); +			free_unref_page(page);  		else  			__free_pages_ok(page, order);  	} @@ -6126,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)  	}  } +#ifdef CONFIG_FLAT_NODE_MEM_MAP  static void __ref alloc_node_mem_map(struct pglist_data *pgdat)  {  	unsigned long __maybe_unused start = 0; @@ -6135,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)  	if (!pgdat->node_spanned_pages)  		return; -#ifdef CONFIG_FLAT_NODE_MEM_MAP  	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);  	offset = pgdat->node_start_pfn - start;  	/* ia64 gets its own node_mem_map, before this, without bootmem */ @@ -6157,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)  							       pgdat->node_id);  		pgdat->node_mem_map = map + offset;  	} +	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", +				__func__, pgdat->node_id, (unsigned long)pgdat, +				(unsigned long)pgdat->node_mem_map);  #ifndef CONFIG_NEED_MULTIPLE_NODES  	/*  	 * With no DISCONTIG, the global mem_map is just set as node 0's @@ -6169,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */  	}  #endif -#endif /* CONFIG_FLAT_NODE_MEM_MAP */  } +#else +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +#endif /* CONFIG_FLAT_NODE_MEM_MAP */  void __paginginit free_area_init_node(int nid, unsigned long *zones_size,  		unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6197,16 +6227,49 @@ void __paginginit 
free_area_init_node(int nid, unsigned long *zones_size,  				  zones_size, zholes_size);  	alloc_node_mem_map(pgdat); -#ifdef CONFIG_FLAT_NODE_MEM_MAP -	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", -		nid, (unsigned long)pgdat, -		(unsigned long)pgdat->node_mem_map); -#endif  	reset_deferred_meminit(pgdat);  	free_area_init_core(pgdat);  } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ +	phys_addr_t start, end; +	unsigned long pfn; +	u64 i, pgcnt; + +	/* +	 * Loop through ranges that are reserved, but do not have reported +	 * physical memory backing. +	 */ +	pgcnt = 0; +	for_each_resv_unavail_range(i, &start, &end) { +		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { +			mm_zero_struct_page(pfn_to_page(pfn)); +			pgcnt++; +		} +	} + +	/* +	 * Struct pages that do not have backing memory. This could be because +	 * firmware is using some of this memory, or for some other reasons. +	 * Once memblock is changed so such behaviour is not allowed: i.e. +	 * list of "reserved" memory must be a subset of list of "memory", then +	 * this code can be removed. +	 */ +	if (pgcnt) +		pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ +  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  #if MAX_NUMNODES > 1 @@ -6630,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)  			node_set_state(nid, N_MEMORY);  		check_for_memory(pgdat, nid);  	} +	zero_resv_unavail();  }  static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6793,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size)  {  	free_area_init_node(0, zones_size,  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); +	zero_resv_unavail();  }  static int page_alloc_cpu_dead(unsigned int cpu) @@ -7305,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename,  	log2qty = ilog2(numentries); -	/* -	 * memblock allocator returns zeroed memory already, so HASH_ZERO is -	 * currently not used when HASH_EARLY is specified. -	 */  	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;  	do {  		size = bucketsize << log2qty; -		if (flags & HASH_EARLY) -			table = memblock_virt_alloc_nopanic(size, 0); -		else if (hashdist) +		if (flags & HASH_EARLY) { +			if (flags & HASH_ZERO) +				table = memblock_virt_alloc_nopanic(size, 0); +			else +				table = memblock_virt_alloc_raw(size, 0); +		} else if (hashdist) {  			table = __vmalloc(size, gfp_flags, PAGE_KERNEL); -		else { +		} else {  			/*  			 * If bucketsize is not a power-of-two, we may free  			 * some pages at the end of hash table which @@ -7353,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename,   * race condition. So you can't expect this function should be exact.   
*/  bool has_unmovable_pages(struct zone *zone, struct page *page, int count, +			 int migratetype,  			 bool skip_hwpoisoned_pages)  {  	unsigned long pfn, iter, found; -	int mt;  	/*  	 * For avoiding noise data, lru_add_drain_all() should be called @@ -7364,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,  	 */  	if (zone_idx(zone) == ZONE_MOVABLE)  		return false; -	mt = get_pageblock_migratetype(page); -	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + +	/* +	 * CMA allocations (alloc_contig_range) really need to mark isolate +	 * CMA pageblocks even when they are not movable in fact so consider +	 * them movable here. +	 */ +	if (is_migrate_cma(migratetype) && +			is_migrate_cma(get_pageblock_migratetype(page)))  		return false;  	pfn = page_to_pfn(page); @@ -7377,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,  		page = pfn_to_page(check); +		if (PageReserved(page)) +			return true; +  		/*  		 * Hugepages are not in LRU lists, but they're movable.  		 * We need not scan over tail pages bacause we don't @@ -7450,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page)  	if (!zone_spans_pfn(zone, pfn))  		return false; -	return !has_unmovable_pages(zone, page, 0, true); +	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);  }  #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) @@ -7546,6 +7619,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,  		.zone = page_zone(pfn_to_page(start)),  		.mode = MIGRATE_SYNC,  		.ignore_skip_hint = true, +		.no_set_skip_hint = true,  		.gfp_mask = current_gfp_context(gfp_mask),  	};  	INIT_LIST_HEAD(&cc.migratepages); | 
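The largest interface change above is the removal of the hot/cold hint: free_hot_cold_page() and free_hot_cold_page_list() become free_unref_page() and free_unref_page_list(), and single pages always go to the head of the per-cpu list while bulk refills are added at the tail. A minimal sketch of how a caller converts, assuming the declarations stay visible via linux/gfp.h; the wrapper function itself is hypothetical and not part of the patch:

/*
 * Hypothetical caller illustrating the free_hot_cold_page() ->
 * free_unref_page() conversion from this patch. Only the renamed calls
 * are taken from the diff; this function is an assumption.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static void demo_put_pages(struct page *page, struct list_head *batch)
{
	/* Old API: free_hot_cold_page(page, false); */
	free_unref_page(page);

	/* Old API: free_hot_cold_page_list(batch, true); */
	free_unref_page_list(batch);
}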
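The new DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key) lets zone_statistics() return early and skip the NUMA counter updates when the key is disabled, at essentially no cost on the fast path. A minimal sketch of toggling the key with the standard jump-label helpers; the control function below is an assumption, since the actual sysctl plumbing for this key lives outside page_alloc.c:

/*
 * Sketch of a control path flipping the NUMA-stats static key defined in
 * this patch. DECLARE_STATIC_KEY_TRUE() matches the
 * DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key) added to page_alloc.c.
 */
#include <linux/jump_label.h>

DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);

static void demo_set_numa_stats(bool enable)
{
	if (enable)
		static_branch_enable(&vm_numa_stat_key);
	else
		static_branch_disable(&vm_numa_stat_key);
	/*
	 * With the key disabled, zone_statistics() bails out via
	 * static_branch_likely(&vm_numa_stat_key) before touching any
	 * NUMA_HIT/NUMA_LOCAL/NUMA_OTHER counters.
	 */
}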
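has_unmovable_pages() also gains a migratetype argument so that CMA isolation can treat MIGRATE_CMA pageblocks as movable; the diff updates is_pageblock_removable_nolock() to pass MIGRATE_MOVABLE. A hedged sketch of another caller passing its isolation migratetype through (the helper below is hypothetical; only the new signature comes from the diff):

/*
 * Hypothetical isolation helper showing the extra migratetype argument
 * introduced by this patch.
 */
#include <linux/mmzone.h>
#include <linux/page-isolation.h>

static bool demo_pageblock_isolatable(struct zone *zone, struct page *page,
				      int migratetype, bool skip_hwpoisoned)
{
	/* Old signature: has_unmovable_pages(zone, page, 0, skip_hwpoisoned) */
	return !has_unmovable_pages(zone, page, 0, migratetype,
				    skip_hwpoisoned);
}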

