1/*
2 * linux/mm/memory_hotplug.c
3 *
4 * Copyright (C)
5 */
6
7#include <linux/stddef.h>
8#include <linux/mm.h>
9#include <linux/sched/signal.h>
10#include <linux/swap.h>
11#include <linux/interrupt.h>
12#include <linux/pagemap.h>
13#include <linux/compiler.h>
14#include <linux/export.h>
15#include <linux/pagevec.h>
16#include <linux/writeback.h>
17#include <linux/slab.h>
18#include <linux/sysctl.h>
19#include <linux/cpu.h>
20#include <linux/memory.h>
21#include <linux/memremap.h>
22#include <linux/memory_hotplug.h>
23#include <linux/highmem.h>
24#include <linux/vmalloc.h>
25#include <linux/ioport.h>
26#include <linux/delay.h>
27#include <linux/migrate.h>
28#include <linux/page-isolation.h>
29#include <linux/pfn.h>
30#include <linux/suspend.h>
31#include <linux/mm_inline.h>
32#include <linux/firmware-map.h>
33#include <linux/stop_machine.h>
34#include <linux/hugetlb.h>
35#include <linux/memblock.h>
36#include <linux/bootmem.h>
37#include <linux/compaction.h>
38#include <linux/rmap.h>
39
40#include <asm/tlbflush.h>
41
42#include "internal.h"
43
44/*
45 * online_page_callback contains a pointer to the current page-onlining
46 * function. Initially it is generic_online_page(). If required, it can be
47 * changed by calling set_online_page_callback() to register a callback, and
48 * restored to the generic one with restore_online_page_callback().
49 */
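/*
 * Example (hypothetical driver, not a call site in this file): a memory
 * ballooning driver can intercept onlined pages instead of handing them
 * straight to the page allocator:
 *
 *    static void my_online_page(struct page *page)
 *    {
 *        __online_page_set_limits(page);
 *        __online_page_increment_counters(page);
 *        __online_page_free(page);
 *    }
 *
 *    set_online_page_callback(&my_online_page);
 *    ...
 *    restore_online_page_callback(&my_online_page);
 */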
50
51static void generic_online_page(struct page *page);
52
53static online_page_callback_t online_page_callback = generic_online_page;
54static DEFINE_MUTEX(online_page_callback_lock);
55
56DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
57
58void get_online_mems(void)
59{
60 percpu_down_read(&mem_hotplug_lock);
61}
62
63void put_online_mems(void)
64{
65 percpu_up_read(&mem_hotplug_lock);
66}
67
68bool movable_node_enabled = false;
69
70#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
71bool memhp_auto_online;
72#else
73bool memhp_auto_online = true;
74#endif
75EXPORT_SYMBOL_GPL(memhp_auto_online);
76
77static int __init setup_memhp_default_state(char *str)
78{
79 if (!strcmp(str, "online"))
80 memhp_auto_online = true;
81 else if (!strcmp(str, "offline"))
82 memhp_auto_online = false;
83
84 return 1;
85}
86__setup("memhp_default_state=", setup_memhp_default_state);
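/*
 * Example: booting with "memhp_default_state=online" on the kernel command
 * line makes memory blocks added later come up online automatically;
 * "memhp_default_state=offline" selects manual onlining. The compile-time
 * default is controlled by CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE above.
 */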
87
88void mem_hotplug_begin(void)
89{
90 cpus_read_lock();
91 percpu_down_write(&mem_hotplug_lock);
92}
93
94void mem_hotplug_done(void)
95{
96 percpu_up_write(&mem_hotplug_lock);
97 cpus_read_unlock();
98}
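/*
 * Locking sketch: get_online_mems()/put_online_mems() take the reader side
 * of mem_hotplug_lock for code that must not race with hotplug
 * (walk_some_memory_state() below is a made-up placeholder):
 *
 *    get_online_mems();
 *    walk_some_memory_state();
 *    put_online_mems();
 *
 * The hotplug paths below take the writer side via mem_hotplug_begin()/
 * mem_hotplug_done(), which also pins CPU hotplug.
 */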
99
100/* add this memory to iomem resource */
101static struct resource *register_memory_resource(u64 start, u64 size)
102{
103 struct resource *res, *conflict;
104 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
105 if (!res)
106 return ERR_PTR(-ENOMEM);
107
108 res->name = "System RAM";
109 res->start = start;
110 res->end = start + size - 1;
111 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
112 conflict = request_resource_conflict(&iomem_resource, res);
113 if (conflict) {
114 if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
115 pr_debug("Device unaddressable memory block "
116 "memory hotplug at %#010llx !\n",
117 (unsigned long long)start);
118 }
119 pr_debug("System RAM resource %pR cannot be added\n", res);
120 kfree(res);
121 return ERR_PTR(-EEXIST);
122 }
123 return res;
124}
125
126static void release_memory_resource(struct resource *res)
127{
128 if (!res)
129 return;
130 release_resource(res);
131 kfree(res);
132 return;
133}
134
135#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
136void get_page_bootmem(unsigned long info, struct page *page,
137 unsigned long type)
138{
139 page->freelist = (void *)type;
140 SetPagePrivate(page);
141 set_page_private(page, info);
142 page_ref_inc(page);
143}
144
145void put_page_bootmem(struct page *page)
146{
147 unsigned long type;
148
149 type = (unsigned long) page->freelist;
150 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
151 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
152
153 if (page_ref_dec_return(page) == 1) {
154 page->freelist = NULL;
155 ClearPagePrivate(page);
156 set_page_private(page, 0);
157 INIT_LIST_HEAD(&page->lru);
158 free_reserved_page(page);
159 }
160}
161
162#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
163#ifndef CONFIG_SPARSEMEM_VMEMMAP
164static void register_page_bootmem_info_section(unsigned long start_pfn)
165{
166 unsigned long *usemap, mapsize, section_nr, i;
167 struct mem_section *ms;
168 struct page *page, *memmap;
169
170 section_nr = pfn_to_section_nr(start_pfn);
171 ms = __nr_to_section(section_nr);
172
173 /* Get section's memmap address */
174 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
175
176 /*
177 * Get page for the memmap's phys address
178 * XXX: need more consideration for sparse_vmemmap...
179 */
180 page = virt_to_page(memmap);
181 mapsize = sizeof(struct page) * PAGES_PER_SECTION;
182 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
183
184 /* remember memmap's page */
185 for (i = 0; i < mapsize; i++, page++)
186 get_page_bootmem(section_nr, page, SECTION_INFO);
187
188 usemap = ms->pageblock_flags;
189 page = virt_to_page(usemap);
190
191 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
192
193 for (i = 0; i < mapsize; i++, page++)
194 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
195
196}
197#else /* CONFIG_SPARSEMEM_VMEMMAP */
198static void register_page_bootmem_info_section(unsigned long start_pfn)
199{
200 unsigned long *usemap, mapsize, section_nr, i;
201 struct mem_section *ms;
202 struct page *page, *memmap;
203
204 section_nr = pfn_to_section_nr(start_pfn);
205 ms = __nr_to_section(section_nr);
206
207 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
208
209 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
210
211 usemap = ms->pageblock_flags;
212 page = virt_to_page(usemap);
213
214 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
215
216 for (i = 0; i < mapsize; i++, page++)
217 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
218}
219#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
220
221void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
222{
223 unsigned long i, pfn, end_pfn, nr_pages;
224 int node = pgdat->node_id;
225 struct page *page;
226
227 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
228 page = virt_to_page(pgdat);
229
230 for (i = 0; i < nr_pages; i++, page++)
231 get_page_bootmem(node, page, NODE_INFO);
232
233 pfn = pgdat->node_start_pfn;
234 end_pfn = pgdat_end_pfn(pgdat);
235
236 /* register section info */
237 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
238 /*
239 * Some platforms can assign the same pfn to multiple nodes - on
240 * node0 as well as nodeN. To avoid registering a pfn against
241 * multiple nodes we check that this pfn does not already
242 * reside in some other nodes.
243 */
244 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
245 register_page_bootmem_info_section(pfn);
246 }
247}
248#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
249
250static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
251 struct vmem_altmap *altmap, bool want_memblock)
252{
253 int ret;
254
255 if (pfn_valid(phys_start_pfn))
256 return -EEXIST;
257
258 ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
259 if (ret < 0)
260 return ret;
261
262 if (!want_memblock)
263 return 0;
264
265 return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
266}
267
268/*
269 * Reasonably generic function for adding memory. It is
270 * expected that archs that support memory hotplug will
271 * call this function after deciding the zone to which to
272 * add the new pages.
273 */
274int __ref __add_pages(int nid, unsigned long phys_start_pfn,
275 unsigned long nr_pages, struct vmem_altmap *altmap,
276 bool want_memblock)
277{
278 unsigned long i;
279 int err = 0;
280 int start_sec, end_sec;
281
282 /* during mem_map initialization, align the hot-added range to a section */
283 start_sec = pfn_to_section_nr(phys_start_pfn);
284 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
285
286 if (altmap) {
287 /*
288 * Validate altmap is within bounds of the total request
289 */
290 if (altmap->base_pfn != phys_start_pfn
291 || vmem_altmap_offset(altmap) > nr_pages) {
292 pr_warn_once("memory add fail, invalid altmap\n");
293 err = -EINVAL;
294 goto out;
295 }
296 altmap->alloc = 0;
297 }
298
299 for (i = start_sec; i <= end_sec; i++) {
300 err = __add_section(nid, section_nr_to_pfn(i), altmap,
301 want_memblock);
302
303 /*
304 * EEXIST is finally dealt with by the iomem resource collision
305 * check; see add_memory() => register_memory_resource().
306 * A warning will be printed if there is a collision.
307 */
308 if (err && (err != -EEXIST))
309 break;
310 err = 0;
311 cond_resched();
312 }
313 vmemmap_populate_print_last();
314out:
315 return err;
316}
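/*
 * Typical caller (sketch only; see the per-arch arch_add_memory()
 * implementations): after creating the direct mapping for the range, an
 * architecture forwards the request here, roughly:
 *
 *    unsigned long start_pfn = start >> PAGE_SHIFT;
 *    unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *    return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
 */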
317
318#ifdef CONFIG_MEMORY_HOTREMOVE
319/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
320static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
321 unsigned long start_pfn,
322 unsigned long end_pfn)
323{
324 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
325 if (unlikely(!pfn_to_online_page(start_pfn)))
326 continue;
327
328 if (unlikely(pfn_to_nid(start_pfn) != nid))
329 continue;
330
331 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
332 continue;
333
334 return start_pfn;
335 }
336
337 return 0;
338}
339
340/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
341static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
342 unsigned long start_pfn,
343 unsigned long end_pfn)
344{
345 unsigned long pfn;
346
347 /* pfn is the end pfn of a memory section. */
348 pfn = end_pfn - 1;
349 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
350 if (unlikely(!pfn_to_online_page(pfn)))
351 continue;
352
353 if (unlikely(pfn_to_nid(pfn) != nid))
354 continue;
355
356 if (zone && zone != page_zone(pfn_to_page(pfn)))
357 continue;
358
359 return pfn;
360 }
361
362 return 0;
363}
364
365static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
366 unsigned long end_pfn)
367{
368 unsigned long zone_start_pfn = zone->zone_start_pfn;
369 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
370 unsigned long zone_end_pfn = z;
371 unsigned long pfn;
372 int nid = zone_to_nid(zone);
373
374 zone_span_writelock(zone);
375 if (zone_start_pfn == start_pfn) {
376 /*
377 * If the section is the smallest section in the zone, we need to
378 * shrink zone->zone_start_pfn and zone->spanned_pages.
379 * In this case, we find the second smallest valid mem_section
380 * for shrinking the zone.
381 */
382 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
383 zone_end_pfn);
384 if (pfn) {
385 zone->zone_start_pfn = pfn;
386 zone->spanned_pages = zone_end_pfn - pfn;
387 }
388 } else if (zone_end_pfn == end_pfn) {
389 /*
390 * If the section is the biggest section in the zone, we need to
391 * shrink zone->spanned_pages.
392 * In this case, we find the second biggest valid mem_section for
393 * shrinking the zone.
394 */
395 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
396 start_pfn);
397 if (pfn)
398 zone->spanned_pages = pfn - zone_start_pfn + 1;
399 }
400
401 /*
402 * The section is neither the biggest nor the smallest mem_section in
403 * the zone; it only creates a hole in the zone. So in this case we
404 * need not change the zone span. But the zone may now contain nothing
405 * but holes, so check whether any valid section remains.
406 */
407 pfn = zone_start_pfn;
408 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
409 if (unlikely(!pfn_to_online_page(pfn)))
410 continue;
411
412 if (page_zone(pfn_to_page(pfn)) != zone)
413 continue;
414
415 /* If the section is current section, it continues the loop */
416 if (start_pfn == pfn)
417 continue;
418
419 /* If we find valid section, we have nothing to do */
420 zone_span_writeunlock(zone);
421 return;
422 }
423
424 /* The zone has no valid section */
425 zone->zone_start_pfn = 0;
426 zone->spanned_pages = 0;
427 zone_span_writeunlock(zone);
428}
429
430static void update_pgdat_span(struct pglist_data *pgdat)
431{
432 unsigned long node_start_pfn = 0, node_end_pfn = 0;
433 struct zone *zone;
434
435 for (zone = pgdat->node_zones;
436 zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
437 unsigned long zone_end_pfn = zone->zone_start_pfn +
438 zone->spanned_pages;
439
440 /* No need to lock the zones, they can't change. */
441 if (!zone->spanned_pages)
442 continue;
443 if (!node_end_pfn) {
444 node_start_pfn = zone->zone_start_pfn;
445 node_end_pfn = zone_end_pfn;
446 continue;
447 }
448
449 if (zone_end_pfn > node_end_pfn)
450 node_end_pfn = zone_end_pfn;
451 if (zone->zone_start_pfn < node_start_pfn)
452 node_start_pfn = zone->zone_start_pfn;
453 }
454
455 pgdat->node_start_pfn = node_start_pfn;
456 pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
457}
458
459static void __remove_zone(struct zone *zone, unsigned long start_pfn)
460{
461 struct pglist_data *pgdat = zone->zone_pgdat;
462 int nr_pages = PAGES_PER_SECTION;
463 unsigned long flags;
464
465#ifdef CONFIG_ZONE_DEVICE
466 /*
467 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
468 * we will not try to shrink the zones - which is okay as
469 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
470 */
471 if (zone_idx(zone) == ZONE_DEVICE)
472 return;
473#endif
474
475 pgdat_resize_lock(zone->zone_pgdat, &flags);
476 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
477 update_pgdat_span(pgdat);
478 pgdat_resize_unlock(zone->zone_pgdat, &flags);
479}
480
481static int __remove_section(struct zone *zone, struct mem_section *ms,
482 unsigned long map_offset, struct vmem_altmap *altmap)
483{
484 unsigned long start_pfn;
485 int scn_nr;
486 int ret = -EINVAL;
487
488 if (!valid_section(ms))
489 return ret;
490
491 ret = unregister_memory_section(ms);
492 if (ret)
493 return ret;
494
495 scn_nr = __section_nr(ms);
496 start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
497 __remove_zone(zone, start_pfn);
498
499 sparse_remove_one_section(zone, ms, map_offset, altmap);
500 return 0;
501}
502
503/**
504 * __remove_pages() - remove sections of pages from a zone
505 * @zone: zone from which pages need to be removed
506 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
507 * @nr_pages: number of pages to remove (must be multiple of section size)
508 * @altmap: alternative device page map or %NULL if default memmap is used
509 *
510 * Generic helper function to remove section mappings and sysfs entries
511 * for the section of the memory we are removing. Caller needs to make
512 * sure that pages are marked reserved and zones are adjusted properly by
513 * calling offline_pages().
514 */
515int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
516 unsigned long nr_pages, struct vmem_altmap *altmap)
517{
518 unsigned long i;
519 unsigned long map_offset = 0;
520 int sections_to_remove, ret = 0;
521
522 /* In the ZONE_DEVICE case device driver owns the memory region */
523 if (is_dev_zone(zone)) {
524 if (altmap)
525 map_offset = vmem_altmap_offset(altmap);
526 } else {
527 resource_size_t start, size;
528
529 start = phys_start_pfn << PAGE_SHIFT;
530 size = nr_pages * PAGE_SIZE;
531
532 ret = release_mem_region_adjustable(&iomem_resource, start,
533 size);
534 if (ret) {
535 resource_size_t endres = start + size - 1;
536
537 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
538 &start, &endres, ret);
539 }
540 }
541
542 clear_zone_contiguous(zone);
543
544 /*
545 * We can only remove entire sections
546 */
547 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
548 BUG_ON(nr_pages % PAGES_PER_SECTION);
549
550 sections_to_remove = nr_pages / PAGES_PER_SECTION;
551 for (i = 0; i < sections_to_remove; i++) {
552 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
553
554 cond_resched();
555 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
556 altmap);
557 map_offset = 0;
558 if (ret)
559 break;
560 }
561
562 set_zone_contiguous(zone);
563
564 return ret;
565}
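/*
 * Typical caller (sketch only; see the per-arch arch_remove_memory()
 * implementations): the architecture resolves the zone for the range,
 * forwards to __remove_pages(), and then tears down its direct mapping,
 * mirroring the __add_pages() path above.
 */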
566#endif /* CONFIG_MEMORY_HOTREMOVE */
567
568int set_online_page_callback(online_page_callback_t callback)
569{
570 int rc = -EINVAL;
571
572 get_online_mems();
573 mutex_lock(&online_page_callback_lock);
574
575 if (online_page_callback == generic_online_page) {
576 online_page_callback = callback;
577 rc = 0;
578 }
579
580 mutex_unlock(&online_page_callback_lock);
581 put_online_mems();
582
583 return rc;
584}
585EXPORT_SYMBOL_GPL(set_online_page_callback);
586
587int restore_online_page_callback(online_page_callback_t callback)
588{
589 int rc = -EINVAL;
590
591 get_online_mems();
592 mutex_lock(&online_page_callback_lock);
593
594 if (online_page_callback == callback) {
595 online_page_callback = generic_online_page;
596 rc = 0;
597 }
598
599 mutex_unlock(&online_page_callback_lock);
600 put_online_mems();
601
602 return rc;
603}
604EXPORT_SYMBOL_GPL(restore_online_page_callback);
605
606void __online_page_set_limits(struct page *page)
607{
608}
609EXPORT_SYMBOL_GPL(__online_page_set_limits);
610
611void __online_page_increment_counters(struct page *page)
612{
613 adjust_managed_page_count(page, 1);
614}
615EXPORT_SYMBOL_GPL(__online_page_increment_counters);
616
617void __online_page_free(struct page *page)
618{
619 __free_reserved_page(page);
620}
621EXPORT_SYMBOL_GPL(__online_page_free);
622
623static void generic_online_page(struct page *page)
624{
625 __online_page_set_limits(page);
626 __online_page_increment_counters(page);
627 __online_page_free(page);
628}
629
630static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
631 void *arg)
632{
633 unsigned long i;
634 unsigned long onlined_pages = *(unsigned long *)arg;
635 struct page *page;
636
637 if (PageReserved(pfn_to_page(start_pfn)))
638 for (i = 0; i < nr_pages; i++) {
639 page = pfn_to_page(start_pfn + i);
640 (*online_page_callback)(page);
641 onlined_pages++;
642 }
643
644 online_mem_sections(start_pfn, start_pfn + nr_pages);
645
646 *(unsigned long *)arg = onlined_pages;
647 return 0;
648}
649
650/* check which state of node_states will be changed when online memory */
651static void node_states_check_changes_online(unsigned long nr_pages,
652 struct zone *zone, struct memory_notify *arg)
653{
654 int nid = zone_to_nid(zone);
655 enum zone_type zone_last = ZONE_NORMAL;
656
657 /*
658 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
659 * contains nodes which have zones of 0...ZONE_NORMAL,
660 * set zone_last to ZONE_NORMAL.
661 *
662 * If we don't have HIGHMEM nor movable node,
663 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
664 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
665 */
666 if (N_MEMORY == N_NORMAL_MEMORY)
667 zone_last = ZONE_MOVABLE;
668
669 /*
670 * If the memory to be onlined is in a zone of 0...zone_last, and
671 * the zones of 0...zone_last have no memory before onlining, we will
672 * need to set the node in node_states[N_NORMAL_MEMORY] after
673 * the memory is onlined.
674 */
675 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
676 arg->status_change_nid_normal = nid;
677 else
678 arg->status_change_nid_normal = -1;
679
680#ifdef CONFIG_HIGHMEM
681 /*
682 * If we have movable node, node_states[N_HIGH_MEMORY]
683 * contains nodes which have zones of 0...ZONE_HIGHMEM,
684 * set zone_last to ZONE_HIGHMEM.
685 *
686 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
687 * contains nodes which have zones of 0...ZONE_MOVABLE,
688 * set zone_last to ZONE_MOVABLE.
689 */
690 zone_last = ZONE_HIGHMEM;
691 if (N_MEMORY == N_HIGH_MEMORY)
692 zone_last = ZONE_MOVABLE;
693
694 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
695 arg->status_change_nid_high = nid;
696 else
697 arg->status_change_nid_high = -1;
698#else
699 arg->status_change_nid_high = arg->status_change_nid_normal;
700#endif
701
702 /*
703 * If the node doesn't have memory before onlining, we will need to
704 * set the node in node_states[N_MEMORY] after the memory
705 * is onlined.
706 */
707 if (!node_state(nid, N_MEMORY))
708 arg->status_change_nid = nid;
709 else
710 arg->status_change_nid = -1;
711}
712
713static void node_states_set_node(int node, struct memory_notify *arg)
714{
715 if (arg->status_change_nid_normal >= 0)
716 node_set_state(node, N_NORMAL_MEMORY);
717
718 if (arg->status_change_nid_high >= 0)
719 node_set_state(node, N_HIGH_MEMORY);
720
721 node_set_state(node, N_MEMORY);
722}
723
724static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
725 unsigned long nr_pages)
726{
727 unsigned long old_end_pfn = zone_end_pfn(zone);
728
729 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
730 zone->zone_start_pfn = start_pfn;
731
732 zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
733}
734
735static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
736 unsigned long nr_pages)
737{
738 unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
739
740 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
741 pgdat->node_start_pfn = start_pfn;
742
743 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
744}
745
746void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
747 unsigned long nr_pages, struct vmem_altmap *altmap)
748{
749 struct pglist_data *pgdat = zone->zone_pgdat;
750 int nid = pgdat->node_id;
751 unsigned long flags;
752
753 if (zone_is_empty(zone))
754 init_currently_empty_zone(zone, start_pfn, nr_pages);
755
756 clear_zone_contiguous(zone);
757
758 /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
759 pgdat_resize_lock(pgdat, &flags);
760 zone_span_writelock(zone);
761 resize_zone_range(zone, start_pfn, nr_pages);
762 zone_span_writeunlock(zone);
763 resize_pgdat_range(pgdat, start_pfn, nr_pages);
764 pgdat_resize_unlock(pgdat, &flags);
765
766 /*
767 * TODO: now we have a visible range of pages which are not associated
768 * with their zone properly. Not nice, but set_pfnblock_flags_mask()
769 * expects the zone to span the pfn range. All the pages in the range
770 * are reserved, so nobody should be touching them, so we should be safe.
771 */
772 memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
773 MEMMAP_HOTPLUG, altmap);
774
775 set_zone_contiguous(zone);
776}
777
778/*
779 * Returns a default kernel memory zone for the given pfn range.
780 * If no kernel zone covers this pfn range it will automatically go
781 * to the ZONE_NORMAL.
782 */
783static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
784 unsigned long nr_pages)
785{
786 struct pglist_data *pgdat = NODE_DATA(nid);
787 int zid;
788
789 for (zid = 0; zid <= ZONE_NORMAL; zid++) {
790 struct zone *zone = &pgdat->node_zones[zid];
791
792 if (zone_intersects(zone, start_pfn, nr_pages))
793 return zone;
794 }
795
796 return &pgdat->node_zones[ZONE_NORMAL];
797}
798
799static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
800 unsigned long nr_pages)
801{
802 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
803 nr_pages);
804 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
805 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
806 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
807
808 /*
809 * We inherit the existing zone in a simple case where zones do not
810 * overlap in the given range
811 */
812 if (in_kernel ^ in_movable)
813 return (in_kernel) ? kernel_zone : movable_zone;
814
815 /*
816 * If the range doesn't belong to any zone or two zones overlap in the
817 * given range then we use movable zone only if movable_node is
818 * enabled because we always online to a kernel zone by default.
819 */
820 return movable_node_enabled ? movable_zone : kernel_zone;
821}
822
823struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
824 unsigned long nr_pages)
825{
826 if (online_type == MMOP_ONLINE_KERNEL)
827 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
828
829 if (online_type == MMOP_ONLINE_MOVABLE)
830 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
831
832 return default_zone_for_pfn(nid, start_pfn, nr_pages);
833}
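/*
 * Example: writing "online_movable" to
 * /sys/devices/system/memory/memoryN/state reaches this function with
 * MMOP_ONLINE_MOVABLE and selects ZONE_MOVABLE, "online_kernel" maps to
 * MMOP_ONLINE_KERNEL and a kernel zone, and plain "online"
 * (MMOP_ONLINE_KEEP) falls through to default_zone_for_pfn().
 */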
834
835/*
836 * Associates the given pfn range with the given node and the zone appropriate
837 * for the given online type.
838 */
839static struct zone * __meminit move_pfn_range(int online_type, int nid,
840 unsigned long start_pfn, unsigned long nr_pages)
841{
842 struct zone *zone;
843
844 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
845 move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
846 return zone;
847}
848
849int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
850{
851 unsigned long flags;
852 unsigned long onlined_pages = 0;
853 struct zone *zone;
854 int need_zonelists_rebuild = 0;
855 int nid;
856 int ret;
857 struct memory_notify arg;
858 struct memory_block *mem;
859
860 mem_hotplug_begin();
861
862 /*
863 * We can't use pfn_to_nid() because nid might be stored in struct page
864 * which is not yet initialized. Instead, we find nid from memory block.
865 */
866 mem = find_memory_block(__pfn_to_section(pfn));
867 nid = mem->nid;
868 put_device(&mem->dev);
869
870 /* associate pfn range with the zone */
871 zone = move_pfn_range(online_type, nid, pfn, nr_pages);
872
873 arg.start_pfn = pfn;
874 arg.nr_pages = nr_pages;
875 node_states_check_changes_online(nr_pages, zone, &arg);
876
877 ret = memory_notify(MEM_GOING_ONLINE, &arg);
878 ret = notifier_to_errno(ret);
879 if (ret)
880 goto failed_addition;
881
882 /*
883 * If this zone is not populated, then it is not in the zonelist.
884 * This means the page allocator ignores this zone.
885 * So, the zonelist must be updated after onlining.
886 */
887 if (!populated_zone(zone)) {
888 need_zonelists_rebuild = 1;
889 setup_zone_pageset(zone);
890 }
891
892 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
893 online_pages_range);
894 if (ret) {
895 if (need_zonelists_rebuild)
896 zone_pcp_reset(zone);
897 goto failed_addition;
898 }
899
900 zone->present_pages += onlined_pages;
901
902 pgdat_resize_lock(zone->zone_pgdat, &flags);
903 zone->zone_pgdat->node_present_pages += onlined_pages;
904 pgdat_resize_unlock(zone->zone_pgdat, &flags);
905
906 if (onlined_pages) {
907 node_states_set_node(nid, &arg);
908 if (need_zonelists_rebuild)
909 build_all_zonelists(NULL);
910 else
911 zone_pcp_update(zone);
912 }
913
914 init_per_zone_wmark_min();
915
916 if (onlined_pages) {
917 kswapd_run(nid);
918 kcompactd_run(nid);
919 }
920
921 vm_total_pages = nr_free_pagecache_pages();
922
923 writeback_set_ratelimit();
924
925 if (onlined_pages)
926 memory_notify(MEM_ONLINE, &arg);
927 mem_hotplug_done();
928 return 0;
929
930failed_addition:
931 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
932 (unsigned long long) pfn << PAGE_SHIFT,
933 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
934 memory_notify(MEM_CANCEL_ONLINE, &arg);
935 mem_hotplug_done();
936 return ret;
937}
938#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
939
940static void reset_node_present_pages(pg_data_t *pgdat)
941{
942 struct zone *z;
943
944 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
945 z->present_pages = 0;
946
947 pgdat->node_present_pages = 0;
948}
949
950/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
951static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
952{
953 struct pglist_data *pgdat;
954 unsigned long start_pfn = PFN_DOWN(start);
955
956 pgdat = NODE_DATA(nid);
957 if (!pgdat) {
958 pgdat = arch_alloc_nodedata(nid);
959 if (!pgdat)
960 return NULL;
961
962 arch_refresh_nodedata(nid, pgdat);
963 } else {
964 /*
965 * Reset the nr_zones, order and classzone_idx before reuse.
966 * Note that kswapd will init kswapd_classzone_idx properly
967 * when it starts in the near future.
968 */
969 pgdat->nr_zones = 0;
970 pgdat->kswapd_order = 0;
971 pgdat->kswapd_classzone_idx = 0;
972 }
973
974 /* we can use NODE_DATA(nid) from here */
975
976 pgdat->node_id = nid;
977 pgdat->node_start_pfn = start_pfn;
978
979 /* init node's zones as empty zones, we don't have any present pages. */
980 free_area_init_core_hotplug(nid);
981 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
982
983 /*
984 * The node we allocated has no zone fallback lists. To avoid
985 * accessing a not-yet-initialized zonelist, build it here.
986 */
987 build_all_zonelists(pgdat);
988
989 /*
990 * When memory is hot-added, all the memory is in offline state. So
991 * clear all zones' present_pages because they will be updated in
992 * online_pages() and offline_pages().
993 */
994 reset_node_managed_pages(pgdat);
995 reset_node_present_pages(pgdat);
996
997 return pgdat;
998}
999
1000static void rollback_node_hotadd(int nid)
1001{
1002 pg_data_t *pgdat = NODE_DATA(nid);
1003
1004 arch_refresh_nodedata(nid, NULL);
1005 free_percpu(pgdat->per_cpu_nodestats);
1006 arch_free_nodedata(pgdat);
1007 return;
1008}
1009
1010
1011/**
1012 * try_online_node - online a node if offlined
1013 * @nid: the node ID
1014 * @start: start addr of the node
1015 * @set_node_online: Whether we want to online the node
1016 * called by cpu_up() to online a node without onlined memory.
1017 *
1018 * Returns:
1019 * 1 -> a new node has been allocated
1020 * 0 -> the node is already online
1021 * -ENOMEM -> the node could not be allocated
1022 */
1023static int __try_online_node(int nid, u64 start, bool set_node_online)
1024{
1025 pg_data_t *pgdat;
1026 int ret = 1;
1027
1028 if (node_online(nid))
1029 return 0;
1030
1031 pgdat = hotadd_new_pgdat(nid, start);
1032 if (!pgdat) {
1033 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1034 ret = -ENOMEM;
1035 goto out;
1036 }
1037
1038 if (set_node_online) {
1039 node_set_online(nid);
1040 ret = register_one_node(nid);
1041 BUG_ON(ret);
1042 }
1043out:
1044 return ret;
1045}
1046
1047/*
1048 * Users of this function always want to online/register the node
1049 */
1050int try_online_node(int nid)
1051{
1052 int ret;
1053
1054 mem_hotplug_begin();
1055 ret = __try_online_node(nid, 0, true);
1056 mem_hotplug_done();
1057 return ret;
1058}
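/*
 * Example caller: cpu_up() invokes try_online_node(cpu_to_node(cpu)) so
 * that a CPU sitting on a memoryless, still-offline node gets a usable
 * pgdat before being brought up.
 */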
1059
1060static int check_hotplug_memory_range(u64 start, u64 size)
1061{
1062 unsigned long block_sz = memory_block_size_bytes();
1063 u64 block_nr_pages = block_sz >> PAGE_SHIFT;
1064 u64 nr_pages = size >> PAGE_SHIFT;
1065 u64 start_pfn = PFN_DOWN(start);
1066
1067 /* memory range must be block size aligned */
1068 if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
1069 !IS_ALIGNED(nr_pages, block_nr_pages)) {
1070 pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
1071 block_sz, start, size);
1072 return -EINVAL;
1073 }
1074
1075 return 0;
1076}
1077
1078static int online_memory_block(struct memory_block *mem, void *arg)
1079{
1080 return device_online(&mem->dev);
1081}
1082
1083/*
1084 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1085 * and online/offline operations (triggered e.g. by sysfs).
1086 *
1087 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
1088 */
1089int __ref add_memory_resource(int nid, struct resource *res, bool online)
1090{
1091 u64 start, size;
1092 bool new_node = false;
1093 int ret;
1094
1095 start = res->start;
1096 size = resource_size(res);
1097
1098 ret = check_hotplug_memory_range(start, size);
1099 if (ret)
1100 return ret;
1101
1102 mem_hotplug_begin();
1103
1104 /*
1105 * Add new range to memblock so that when hotadd_new_pgdat() is called
1106 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1107 * this new range and calculate total pages correctly. The range will
1108 * be removed at hot-remove time.
1109 */
1110 memblock_add_node(start, size, nid);
1111
1112 ret = __try_online_node(nid, start, false);
1113 if (ret < 0)
1114 goto error;
1115 new_node = ret;
1116
1117 /* call arch's memory hotadd */
1118 ret = arch_add_memory(nid, start, size, NULL, true);
1119 if (ret < 0)
1120 goto error;
1121
1122 if (new_node) {
1123 /* If the sysfs file of the new node can't be created, CPUs on the
1124 * node can't be hot-added. There is no way to roll back now, so
1125 * catch it with BUG_ON(), reluctantly.
1126 * We online the node here; we can't roll back from this point.
1127 */
1128 node_set_online(nid);
1129 ret = __register_one_node(nid);
1130 BUG_ON(ret);
1131 }
1132
1133 /* link memory sections under this node.*/
1134 ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
1135 BUG_ON(ret);
1136
1137 /* create new memmap entry */
1138 firmware_map_add_hotplug(start, start + size, "System RAM");
1139
1140 /* device_online() will take the lock when calling online_pages() */
1141 mem_hotplug_done();
1142
1143 /* online pages if requested */
1144 if (online)
1145 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1146 NULL, online_memory_block);
1147
1148 return ret;
1149error:
1150 /* rollback pgdat allocation and others */
1151 if (new_node)
1152 rollback_node_hotadd(nid);
1153 memblock_remove(start, size);
1154 mem_hotplug_done();
1155 return ret;
1156}
1157
1158/* requires device_hotplug_lock, see add_memory_resource() */
1159int __ref __add_memory(int nid, u64 start, u64 size)
1160{
1161 struct resource *res;
1162 int ret;
1163
1164 res = register_memory_resource(start, size);
1165 if (IS_ERR(res))
1166 return PTR_ERR(res);
1167
1168 ret = add_memory_resource(nid, res, memhp_auto_online);
1169 if (ret < 0)
1170 release_memory_resource(res);
1171 return ret;
1172}
1173
1174int add_memory(int nid, u64 start, u64 size)
1175{
1176 int rc;
1177
1178 lock_device_hotplug();
1179 rc = __add_memory(nid, start, size);
1180 unlock_device_hotplug();
1181
1182 return rc;
1183}
1184EXPORT_SYMBOL_GPL(add_memory);
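/*
 * Usage sketch (assumption, modelled on ACPI-style hot-add drivers; the
 * names below are illustrative):
 *
 *    u64 start = res->start;
 *    u64 size = resource_size(res);
 *    int nid = memory_add_physaddr_to_nid(start);
 *
 *    rc = add_memory(nid, start, size);
 *
 * The range must be aligned to the memory block size, see
 * check_hotplug_memory_range().
 */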
1185
1186#ifdef CONFIG_MEMORY_HOTREMOVE
1187/*
1188 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
1189 * set and the size of the free page is given by page_order(). Using this,
1190 * the function determines if the pageblock contains only free pages.
1191 * Due to buddy constraints, a free page at least the size of a pageblock will
1192 * be located at the start of the pageblock.
1193 */
1194static inline int pageblock_free(struct page *page)
1195{
1196 return PageBuddy(page) && page_order(page) >= pageblock_order;
1197}
1198
1199/* Return the pfn of the start of the next active pageblock after a given pfn */
1200static unsigned long next_active_pageblock(unsigned long pfn)
1201{
1202 struct page *page = pfn_to_page(pfn);
1203
1204 /* Ensure the starting page is pageblock-aligned */
1205 BUG_ON(pfn & (pageblock_nr_pages - 1));
1206
1207 /* If the entire pageblock is free, move to the end of free page */
1208 if (pageblock_free(page)) {
1209 int order;
1210 /* be careful: we don't hold any locks, page_order() can change. */
1211 order = page_order(page);
1212 if ((order < MAX_ORDER) && (order >= pageblock_order))
1213 return pfn + (1 << order);
1214 }
1215
1216 return pfn + pageblock_nr_pages;
1217}
1218
1219static bool is_pageblock_removable_nolock(unsigned long pfn)
1220{
1221 struct page *page = pfn_to_page(pfn);
1222 struct zone *zone;
1223
1224 /*
1225 * We have to be careful here because we are iterating over memory
1226 * sections which are not zone aware so we might end up outside of
1227 * the zone but still within the section.
1228 * We have to take care of the node as well. If the node is offline,
1229 * its NODE_DATA will be NULL - see page_zone.
1230 */
1231 if (!node_online(page_to_nid(page)))
1232 return false;
1233
1234 zone = page_zone(page);
1235 pfn = page_to_pfn(page);
1236 if (!zone_spans_pfn(zone, pfn))
1237 return false;
1238
1239 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
1240}
1241
1242/* Checks if this range of memory is likely to be hot-removable. */
1243bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1244{
1245 unsigned long end_pfn, pfn;
1246
1247 end_pfn = min(start_pfn + nr_pages,
1248 zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
1249
1250 /* Check the starting page of each pageblock within the range */
1251 for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
1252 if (!is_pageblock_removable_nolock(pfn))
1253 return false;
1254 cond_resched();
1255 }
1256
1257 /* All pageblocks in the memory block are likely to be hot-removable */
1258 return true;
1259}
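/*
 * This is what backs the per-block "removable" sysfs attribute, e.g.
 *
 *    cat /sys/devices/system/memory/memoryN/removable
 *
 * A value of 1 only means that removal is likely to succeed, not that it
 * is guaranteed to.
 */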
1260
1261/*
1262 * Confirm all pages in a range [start, end) belong to the same zone.
1263 * When true, return its valid [start, end).
1264 */
1265int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
1266 unsigned long *valid_start, unsigned long *valid_end)
1267{
1268 unsigned long pfn, sec_end_pfn;
1269 unsigned long start, end;
1270 struct zone *zone = NULL;
1271 struct page *page;
1272 int i;
1273 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
1274 pfn < end_pfn;
1275 pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
1276 /* Make sure the memory section is present first */
1277 if (!present_section_nr(pfn_to_section_nr(pfn)))
1278 continue;
1279 for (; pfn < sec_end_pfn && pfn < end_pfn;
1280 pfn += MAX_ORDER_NR_PAGES) {
1281 i = 0;
1282 /* This is just a CONFIG_HOLES_IN_ZONE check.*/
1283 while ((i < MAX_ORDER_NR_PAGES) &&
1284 !pfn_valid_within(pfn + i))
1285 i++;
1286 if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
1287 continue;
1288 /* Check if we got outside of the zone */
1289 if (zone && !zone_spans_pfn(zone, pfn + i))
1290 return 0;
1291 page = pfn_to_page(pfn + i);
1292 if (zone && page_zone(page) != zone)
1293 return 0;
1294 if (!zone)
1295 start = pfn + i;
1296 zone = page_zone(page);
1297 end = pfn + MAX_ORDER_NR_PAGES;
1298 }
1299 }
1300
1301 if (zone) {
1302 *valid_start = start;
1303 *valid_end = min(end, end_pfn);
1304 return 1;
1305 } else {
1306 return 0;
1307 }
1308}
1309
1310/*
1311 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
1312 * non-lru movable pages and hugepages). We scan by pfn because it's much
1313 * easier than scanning over the linked lists. This function returns the pfn
1314 * of the first movable page found, otherwise 0.
1315 */
1316static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1317{
1318 unsigned long pfn;
1319
1320 for (pfn = start; pfn < end; pfn++) {
1321 struct page *page, *head;
1322 unsigned long skip;
1323
1324 if (!pfn_valid(pfn))
1325 continue;
1326 page = pfn_to_page(pfn);
1327 if (PageLRU(page))
1328 return pfn;
1329 if (__PageMovable(page))
1330 return pfn;
1331
1332 if (!PageHuge(page))
1333 continue;
1334 head = compound_head(page);
1335 if (hugepage_migration_supported(page_hstate(head)) &&
1336 page_huge_active(head))
1337 return pfn;
1338 skip = (1 << compound_order(head)) - (page - head);
1339 pfn += skip - 1;
1340 }
1341 return 0;
1342}
1343
1344static struct page *new_node_page(struct page *page, unsigned long private)
1345{
1346 int nid = page_to_nid(page);
1347 nodemask_t nmask = node_states[N_MEMORY];
1348
1349 /*
1350 * try to allocate from a different node but reuse this node if there
1351 * are no other online nodes to be used (e.g. we are offlining a part
1352 * of the only existing node)
1353 */
1354 node_clear(nid, nmask);
1355 if (nodes_empty(nmask))
1356 node_set(nid, nmask);
1357
1358 return new_page_nodemask(page, nid, &nmask);
1359}
1360
1361#define NR_OFFLINE_AT_ONCE_PAGES (256)
1362static int
1363do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1364{
1365 unsigned long pfn;
1366 struct page *page;
1367 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1368 int not_managed = 0;
1369 int ret = 0;
1370 LIST_HEAD(source);
1371
1372 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
1373 if (!pfn_valid(pfn))
1374 continue;
1375 page = pfn_to_page(pfn);
1376
1377 if (PageHuge(page)) {
1378 struct page *head = compound_head(page);
1379 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1380 if (compound_order(head) > PFN_SECTION_SHIFT) {
1381 ret = -EBUSY;
1382 break;
1383 }
1384 if (isolate_huge_page(page, &source))
1385 move_pages -= 1 << compound_order(head);
1386 continue;
1387 } else if (PageTransHuge(page))
1388 pfn = page_to_pfn(compound_head(page))
1389 + hpage_nr_pages(page) - 1;
1390
1391 /*
1392 * HWPoison pages have elevated reference counts so the migration would
1393 * fail on them. It also doesn't make any sense to migrate them in the
1394 * first place. Still try to unmap such a page in case it is still mapped
1395 * (e.g. the current hwpoison implementation doesn't unmap KSM pages, but keep
1396 * the unmap here as the catch-all safety net).
1397 */
1398 if (PageHWPoison(page)) {
1399 if (WARN_ON(PageLRU(page)))
1400 isolate_lru_page(page);
1401 if (page_mapped(page))
1402 try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
1403 continue;
1404 }
1405
1406 if (!get_page_unless_zero(page))
1407 continue;
1408 /*
1409 * We can skip free pages. And we can deal with pages on
1410 * LRU and non-lru movable pages.
1411 */
1412 if (PageLRU(page))
1413 ret = isolate_lru_page(page);
1414 else
1415 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1416 if (!ret) { /* Success */
1417 put_page(page);
1418 list_add_tail(&page->lru, &source);
1419 move_pages--;
1420 if (!__PageMovable(page))
1421 inc_node_page_state(page, NR_ISOLATED_ANON +
1422 page_is_file_cache(page));
1423
1424 } else {
1425#ifdef CONFIG_DEBUG_VM
1426 pr_alert("failed to isolate pfn %lx\n", pfn);
1427 dump_page(page, "isolation failed");
1428#endif
1429 put_page(page);
1430 /* Because we don't hold the big zone->lock, we should
1431 check this again here. */
1432 if (page_count(page)) {
1433 not_managed++;
1434 ret = -EBUSY;
1435 break;
1436 }
1437 }
1438 }
1439 if (!list_empty(&source)) {
1440 if (not_managed) {
1441 putback_movable_pages(&source);
1442 goto out;
1443 }
1444
1445 /* Allocate a new page from the nearest neighbor node */
1446 ret = migrate_pages(&source, new_node_page, NULL, 0,
1447 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1448 if (ret)
1449 putback_movable_pages(&source);
1450 }
1451out:
1452 return ret;
1453}
1454
1455/*
1456 * remove from free_area[] and mark all as Reserved.
1457 */
1458static int
1459offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
1460 void *data)
1461{
1462 __offline_isolated_pages(start, start + nr_pages);
1463 return 0;
1464}
1465
1466static void
1467offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
1468{
1469 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
1470 offline_isolated_pages_cb);
1471}
1472
1473/*
1474 * Check that all pages in the range, recorded as a memory resource, are isolated.
1475 */
1476static int
1477check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
1478 void *data)
1479{
1480 int ret;
1481 long offlined = *(long *)data;
1482 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
1483 offlined = nr_pages;
1484 if (!ret)
1485 *(long *)data += offlined;
1486 return ret;
1487}
1488
1489static long
1490check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1491{
1492 long offlined = 0;
1493 int ret;
1494
1495 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
1496 check_pages_isolated_cb);
1497 if (ret < 0)
1498 offlined = (long)ret;
1499 return offlined;
1500}
1501
1502static int __init cmdline_parse_movable_node(char *p)
1503{
1504#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1505 movable_node_enabled = true;
1506#else
1507 pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
1508#endif
1509 return 0;
1510}
1511early_param("movable_node", cmdline_parse_movable_node);
1512
1513/* check which state of node_states will be changed when offline memory */
1514static void node_states_check_changes_offline(unsigned long nr_pages,
1515 struct zone *zone, struct memory_notify *arg)
1516{
1517 struct pglist_data *pgdat = zone->zone_pgdat;
1518 unsigned long present_pages = 0;
1519 enum zone_type zt, zone_last = ZONE_NORMAL;
1520
1521 /*
1522 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1523 * contains nodes which have zones of 0...ZONE_NORMAL,
1524 * set zone_last to ZONE_NORMAL.
1525 *
1526 * If we don't have HIGHMEM nor movable node,
1527 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1528 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1529 */
1530 if (N_MEMORY == N_NORMAL_MEMORY)
1531 zone_last = ZONE_MOVABLE;
1532
1533 /*
1534 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1535 * If the memory to be offlined is in a zone of 0...zone_last,
1536 * and it is the last present memory there, 0...zone_last will
1537 * become empty after offlining, thus we can determine that we will
1538 * need to clear the node from node_states[N_NORMAL_MEMORY].
1539 */
1540 for (zt = 0; zt <= zone_last; zt++)
1541 present_pages += pgdat->node_zones[zt].present_pages;
1542 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1543 arg->status_change_nid_normal = zone_to_nid(zone);
1544 else
1545 arg->status_change_nid_normal = -1;
1546
1547#ifdef CONFIG_HIGHMEM
1548 /*
1549 * If we have movable node, node_states[N_HIGH_MEMORY]
1550 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1551 * set zone_last to ZONE_HIGHMEM.
1552 *
1553 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1554 * contains nodes which have zones of 0...ZONE_MOVABLE,
1555 * set zone_last to ZONE_MOVABLE.
1556 */
1557 zone_last = ZONE_HIGHMEM;
1558 if (N_MEMORY == N_HIGH_MEMORY)
1559 zone_last = ZONE_MOVABLE;
1560
1561 for (; zt <= zone_last; zt++)
1562 present_pages += pgdat->node_zones[zt].present_pages;
1563 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1564 arg->status_change_nid_high = zone_to_nid(zone);
1565 else
1566 arg->status_change_nid_high = -1;
1567#else
1568 arg->status_change_nid_high = arg->status_change_nid_normal;
1569#endif
1570
1571 /*
1572 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1573 */
1574 zone_last = ZONE_MOVABLE;
1575
1576 /*
1577 * check whether node_states[N_HIGH_MEMORY] will be changed.
1578 * If we try to offline the last present @nr_pages from the node,
1579 * we can determine that we will need to clear the node from
1580 * node_states[N_HIGH_MEMORY].
1581 */
1582 for (; zt <= zone_last; zt++)
1583 present_pages += pgdat->node_zones[zt].present_pages;
1584 if (nr_pages >= present_pages)
1585 arg->status_change_nid = zone_to_nid(zone);
1586 else
1587 arg->status_change_nid = -1;
1588}
1589
1590static void node_states_clear_node(int node, struct memory_notify *arg)
1591{
1592 if (arg->status_change_nid_normal >= 0)
1593 node_clear_state(node, N_NORMAL_MEMORY);
1594
1595 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1596 (arg->status_change_nid_high >= 0))
1597 node_clear_state(node, N_HIGH_MEMORY);
1598
1599 if ((N_MEMORY != N_HIGH_MEMORY) &&
1600 (arg->status_change_nid >= 0))
1601 node_clear_state(node, N_MEMORY);
1602}
1603
1604static int __ref __offline_pages(unsigned long start_pfn,
1605 unsigned long end_pfn)
1606{
1607 unsigned long pfn, nr_pages;
1608 long offlined_pages;
1609 int ret, node;
1610 unsigned long flags;
1611 unsigned long valid_start, valid_end;
1612 struct zone *zone;
1613 struct memory_notify arg;
1614
1615 /* at least, alignment against pageblock is necessary */
1616 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1617 return -EINVAL;
1618 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1619 return -EINVAL;
1620
1621 mem_hotplug_begin();
1622
1623 /* This makes hotplug much easier... and readable.
1624 We assume this for now. */
1625 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
1626 &valid_end)) {
1627 mem_hotplug_done();
1628 return -EINVAL;
1629 }
1630
1631 zone = page_zone(pfn_to_page(valid_start));
1632 node = zone_to_nid(zone);
1633 nr_pages = end_pfn - start_pfn;
1634
1635 /* set above range as isolated */
1636 ret = start_isolate_page_range(start_pfn, end_pfn,
1637 MIGRATE_MOVABLE, true);
1638 if (ret) {
1639 mem_hotplug_done();
1640 return ret;
1641 }
1642
1643 arg.start_pfn = start_pfn;
1644 arg.nr_pages = nr_pages;
1645 node_states_check_changes_offline(nr_pages, zone, &arg);
1646
1647 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1648 ret = notifier_to_errno(ret);
1649 if (ret)
1650 goto failed_removal;
1651
1652 pfn = start_pfn;
1653repeat:
1654 /* start memory hot removal */
1655 ret = -EINTR;
1656 if (signal_pending(current))
1657 goto failed_removal;
1658
1659 cond_resched();
1660 lru_add_drain_all();
1661 drain_all_pages(zone);
1662
1663 pfn = scan_movable_pages(start_pfn, end_pfn);
1664 if (pfn) { /* We have movable pages */
1665 ret = do_migrate_range(pfn, end_pfn);
1666 goto repeat;
1667 }
1668
1669 /*
1670 * Dissolve free hugepages in the memory block before actually doing the
1671 * offlining, in order to keep hugetlbfs's object counting consistent.
1672 */
1673 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1674 if (ret)
1675 goto failed_removal;
1676 /* check again */
1677 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1678 if (offlined_pages < 0)
1679 goto repeat;
1680 pr_info("Offlined Pages %ld\n", offlined_pages);
1681 /* OK, all of our target range is isolated.
1682 We cannot roll back at this point. */
1683 offline_isolated_pages(start_pfn, end_pfn);
1684 /* reset pageblock flags and make the migrate type MOVABLE */
1685 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1686 /* removal success */
1687 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1688 zone->present_pages -= offlined_pages;
1689
1690 pgdat_resize_lock(zone->zone_pgdat, &flags);
1691 zone->zone_pgdat->node_present_pages -= offlined_pages;
1692 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1693
1694 init_per_zone_wmark_min();
1695
1696 if (!populated_zone(zone)) {
1697 zone_pcp_reset(zone);
1698 build_all_zonelists(NULL);
1699 } else
1700 zone_pcp_update(zone);
1701
1702 node_states_clear_node(node, &arg);
1703 if (arg.status_change_nid >= 0) {
1704 kswapd_stop(node);
1705 kcompactd_stop(node);
1706 }
1707
1708 vm_total_pages = nr_free_pagecache_pages();
1709 writeback_set_ratelimit();
1710
1711 memory_notify(MEM_OFFLINE, &arg);
1712 mem_hotplug_done();
1713 return 0;
1714
1715failed_removal:
1716 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1717 (unsigned long long) start_pfn << PAGE_SHIFT,
1718 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1719 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1720 /* pushback to free area */
1721 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1722 mem_hotplug_done();
1723 return ret;
1724}
1725
1726int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1727{
1728 return __offline_pages(start_pfn, start_pfn + nr_pages);
1729}
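/*
 * Example: "echo offline > /sys/devices/system/memory/memoryN/state"
 * eventually calls offline_pages() for that single memory block, i.e.
 * with start_pfn aligned to the block and nr_pages equal to the block
 * size in pages.
 */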
1730#endif /* CONFIG_MEMORY_HOTREMOVE */
1731
1732/**
1733 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1734 * @start_pfn: start pfn of the memory range
1735 * @end_pfn: end pfn of the memory range
1736 * @arg: argument passed to func
1737 * @func: callback for each memory section walked
1738 *
1739 * This function walks through all present mem sections in range
1740 * [start_pfn, end_pfn) and calls func on each mem section.
1741 *
1742 * Returns the return value of func.
1743 */
1744int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1745 void *arg, int (*func)(struct memory_block *, void *))
1746{
1747 struct memory_block *mem = NULL;
1748 struct mem_section *section;
1749 unsigned long pfn, section_nr;
1750 int ret;
1751
1752 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1753 section_nr = pfn_to_section_nr(pfn);
1754 if (!present_section_nr(section_nr))
1755 continue;
1756
1757 section = __nr_to_section(section_nr);
1758 /* same memblock? */
1759 if (mem)
1760 if ((section_nr >= mem->start_section_nr) &&
1761 (section_nr <= mem->end_section_nr))
1762 continue;
1763
1764 mem = find_memory_block_hinted(section, mem);
1765 if (!mem)
1766 continue;
1767
1768 ret = func(mem, arg);
1769 if (ret) {
1770 kobject_put(&mem->dev.kobj);
1771 return ret;
1772 }
1773 }
1774
1775 if (mem)
1776 kobject_put(&mem->dev.kobj);
1777
1778 return 0;
1779}
1780
1781#ifdef CONFIG_MEMORY_HOTREMOVE
1782static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1783{
1784 int ret = !is_memblock_offlined(mem);
1785
1786 if (unlikely(ret)) {
1787 phys_addr_t beginpa, endpa;
1788
1789 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1790 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1791 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1792 &beginpa, &endpa);
1793 }
1794
1795 return ret;
1796}
1797
1798static int check_cpu_on_node(pg_data_t *pgdat)
1799{
1800 int cpu;
1801
1802 for_each_present_cpu(cpu) {
1803 if (cpu_to_node(cpu) == pgdat->node_id)
1804 /*
1805 * a cpu on this node hasn't been removed, so we can't
1806 * offline this node.
1807 */
1808 return -EBUSY;
1809 }
1810
1811 return 0;
1812}
1813
1814static void unmap_cpu_on_node(pg_data_t *pgdat)
1815{
1816#ifdef CONFIG_ACPI_NUMA
1817 int cpu;
1818
1819 for_each_possible_cpu(cpu)
1820 if (cpu_to_node(cpu) == pgdat->node_id)
1821 numa_clear_node(cpu);
1822#endif
1823}
1824
1825static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1826{
1827 int ret;
1828
1829 ret = check_cpu_on_node(pgdat);
1830 if (ret)
1831 return ret;
1832
1833 /*
1834 * the node will be offlined when we come here, so we can clear
1835 * the cpu_to_node() now.
1836 */
1837
1838 unmap_cpu_on_node(pgdat);
1839 return 0;
1840}
1841
1842/**
1843 * try_offline_node
1844 * @nid: the node ID
1845 *
1846 * Offline a node if all memory sections and cpus of the node are removed.
1847 *
1848 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1849 * and online/offline operations before this call.
1850 */
1851void try_offline_node(int nid)
1852{
1853 pg_data_t *pgdat = NODE_DATA(nid);
1854 unsigned long start_pfn = pgdat->node_start_pfn;
1855 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1856 unsigned long pfn;
1857
1858 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1859 unsigned long section_nr = pfn_to_section_nr(pfn);
1860
1861 if (!present_section_nr(section_nr))
1862 continue;
1863
1864 if (pfn_to_nid(pfn) != nid)
1865 continue;
1866
1867 /*
1868 * some memory sections of this node have not been removed, so we
1869 * can't offline the node now.
1870 */
1871 return;
1872 }
1873
1874 if (check_and_unmap_cpu_on_node(pgdat))
1875 return;
1876
1877 /*
1878 * all memory/cpus of this node have been removed, so we can offline
1879 * this node now.
1880 */
1881 node_set_offline(nid);
1882 unregister_one_node(nid);
1883}
1884EXPORT_SYMBOL(try_offline_node);
1885
1886/**
1887 * remove_memory
1888 * @nid: the node ID
1889 * @start: physical address of the region to remove
1890 * @size: size of the region to remove
1891 *
1892 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1893 * and online/offline operations before this call, as required by
1894 * try_offline_node().
1895 */
1896void __ref remove_memory(int nid, u64 start, u64 size)
1897{
1898 int ret;
1899
1900 BUG_ON(check_hotplug_memory_range(start, size));
1901
1902 mem_hotplug_begin();
1903
1904 /*
1905 * All memory blocks must be offlined before removing memory. Check
1906 * whether all memory blocks in question are offline and trigger a BUG()
1907 * if this is not the case.
1908 */
1909 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1910 check_memblock_offlined_cb);
1911 if (ret)
1912 BUG();
1913
1914 /* remove memmap entry */
1915 firmware_map_remove(start, start + size, "System RAM");
1916 memblock_free(start, size);
1917 memblock_remove(start, size);
1918
1919 arch_remove_memory(start, size, NULL);
1920
1921 try_offline_node(nid);
1922
1923 mem_hotplug_done();
1924}
1925EXPORT_SYMBOL_GPL(remove_memory);
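/*
 * Usage sketch (assumption): a hot-remove driver offlines all blocks in
 * the range first (e.g. via device_offline() on each memory block) and
 * only then calls, with the device hotplug lock held:
 *
 *    lock_device_hotplug();
 *    remove_memory(nid, start, size);
 *    unlock_device_hotplug();
 */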
1926#endif /* CONFIG_MEMORY_HOTREMOVE */