Blame - src/kernel/linux/v4.19/arch/x86/xen/setup.c - T800

blob: 075ed47993bbf5c1bf9b760511268613c035d8c1 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Machine specific setup for xen
				4	*
				5	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
				6	*/
				7
				8	#include <linux/init.h>
				9	#include <linux/sched.h>
				10	#include <linux/mm.h>
				11	#include <linux/pm.h>
				12	#include <linux/memblock.h>
				13	#include <linux/cpuidle.h>
				14	#include <linux/cpufreq.h>
				15
				16	#include <asm/elf.h>
				17	#include <asm/vdso.h>
				18	#include <asm/e820/api.h>
				19	#include <asm/setup.h>
				20	#include <asm/acpi.h>
				21	#include <asm/numa.h>
				22	#include <asm/xen/hypervisor.h>
				23	#include <asm/xen/hypercall.h>
				24
				25	#include <xen/xen.h>
				26	#include <xen/page.h>
				27	#include <xen/interface/callback.h>
				28	#include <xen/interface/memory.h>
				29	#include <xen/interface/physdev.h>
				30	#include <xen/features.h>
				31	#include <xen/hvc-console.h>
				32	#include "xen-ops.h"
				33	#include "vdso.h"
				34	#include "mmu.h"
				35
				36	#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
				37
				38	/* Amount of extra memory space we add to the e820 ranges */
				39	struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
				40
				41	/* Number of pages released from the initial allocation. */
				42	unsigned long xen_released_pages;
				43
				44	/* E820 map used during setting up memory. */
				45	static struct e820_table xen_e820_table __initdata;
				46
				47	/*
				48	* Buffer used to remap identity mapped pages. We only need the virtual space.
				49	* The physical page behind this address is remapped as needed to different
				50	* buffer pages.
				51	*/
				52	#define REMAP_SIZE (P2M_PER_PAGE - 3)
				53	static struct {
				54	unsigned long next_area_mfn;
				55	unsigned long target_pfn;
				56	unsigned long size;
				57	unsigned long mfns[REMAP_SIZE];
				58	} xen_remap_buf __initdata __aligned(PAGE_SIZE);
				59	static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
				60
				61	/*
				62	* The maximum amount of extra memory compared to the base size. The
				63	* main scaling factor is the size of struct page. At extreme ratios
				64	* of base:extra, all the base memory can be filled with page
				65	* structures for the extra memory, leaving no space for anything
				66	* else.
				67	*
				68	* 10x seems like a reasonable balance between scaling flexibility and
				69	* leaving a practically usable system.
				70	*/
				71	#define EXTRA_MEM_RATIO (10)
				72
				73	static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
				74
				75	static void __init xen_parse_512gb(void)
				76	{
				77	bool val = false;
				78	char *arg;
				79
				80	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
				81	if (!arg)
				82	return;
				83
				84	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
				85	if (!arg)
				86	val = true;
				87	else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
				88	return;
				89
				90	xen_512gb_limit = val;
				91	}
				92
				93	static void __init xen_add_extra_mem(unsigned long start_pfn,
				94	unsigned long n_pfns)
				95	{
				96	int i;
				97
				98	/*
				99	* No need to check for zero size, should happen rarely and will only
				100	* write a new entry regarded to be unused due to zero size.
				101	*/
				102	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
				103	/* Add new region. */
				104	if (xen_extra_mem[i].n_pfns == 0) {
				105	xen_extra_mem[i].start_pfn = start_pfn;
				106	xen_extra_mem[i].n_pfns = n_pfns;
				107	break;
				108	}
				109	/* Append to existing region. */
				110	if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
				111	start_pfn) {
				112	xen_extra_mem[i].n_pfns += n_pfns;
				113	break;
				114	}
				115	}
				116	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
				117	printk(KERN_WARNING "Warning: not enough extra memory regions\n");
				118
				119	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
				120	}
				121
				122	static void __init xen_del_extra_mem(unsigned long start_pfn,
				123	unsigned long n_pfns)
				124	{
				125	int i;
				126	unsigned long start_r, size_r;
				127
				128	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
				129	start_r = xen_extra_mem[i].start_pfn;
				130	size_r = xen_extra_mem[i].n_pfns;
				131
				132	/* Start of region. */
				133	if (start_r == start_pfn) {
				134	BUG_ON(n_pfns > size_r);
				135	xen_extra_mem[i].start_pfn += n_pfns;
				136	xen_extra_mem[i].n_pfns -= n_pfns;
				137	break;
				138	}
				139	/* End of region. */
				140	if (start_r + size_r == start_pfn + n_pfns) {
				141	BUG_ON(n_pfns > size_r);
				142	xen_extra_mem[i].n_pfns -= n_pfns;
				143	break;
				144	}
				145	/* Mid of region. */
				146	if (start_pfn > start_r && start_pfn < start_r + size_r) {
				147	BUG_ON(start_pfn + n_pfns > start_r + size_r);
				148	xen_extra_mem[i].n_pfns = start_pfn - start_r;
				149	/* Calling memblock_reserve() again is okay. */
				150	xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
				151	(start_pfn + n_pfns));
				152	break;
				153	}
				154	}
				155	memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
				156	}
				157
				158	/*
				159	* Called during boot before the p2m list can take entries beyond the
				160	* hypervisor supplied p2m list. Entries in extra mem are to be regarded as
				161	* invalid.
				162	*/
				163	unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
				164	{
				165	int i;
				166
				167	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
				168	if (pfn >= xen_extra_mem[i].start_pfn &&
				169	pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
				170	return INVALID_P2M_ENTRY;
				171	}
				172
				173	return IDENTITY_FRAME(pfn);
				174	}
				175
				176	/*
				177	* Mark all pfns of extra mem as invalid in p2m list.
				178	*/
				179	void __init xen_inv_extra_mem(void)
				180	{
				181	unsigned long pfn, pfn_s, pfn_e;
				182	int i;
				183
				184	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
				185	if (!xen_extra_mem[i].n_pfns)
				186	continue;
				187	pfn_s = xen_extra_mem[i].start_pfn;
				188	pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
				189	for (pfn = pfn_s; pfn < pfn_e; pfn++)
				190	set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
				191	}
				192	}
				193
				194	/*
				195	* Finds the next RAM pfn available in the E820 map after min_pfn.
				196	* This function updates min_pfn with the pfn found and returns
				197	* the size of that range or zero if not found.
				198	*/
				199	static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
				200	{
				201	const struct e820_entry *entry = xen_e820_table.entries;
				202	unsigned int i;
				203	unsigned long done = 0;
				204
				205	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
				206	unsigned long s_pfn;
				207	unsigned long e_pfn;
				208
				209	if (entry->type != E820_TYPE_RAM)
				210	continue;
				211
				212	e_pfn = PFN_DOWN(entry->addr + entry->size);
				213
				214	/* We only care about E820 after this */
				215	if (e_pfn <= *min_pfn)
				216	continue;
				217
				218	s_pfn = PFN_UP(entry->addr);
				219
				220	/* If min_pfn falls within the E820 entry, we want to start
				221	* at the min_pfn PFN.
				222	*/
				223	if (s_pfn <= *min_pfn) {
				224	done = e_pfn - *min_pfn;
				225	} else {
				226	done = e_pfn - s_pfn;
				227	*min_pfn = s_pfn;
				228	}
				229	break;
				230	}
				231
				232	return done;
				233	}
				234
				235	static int __init xen_free_mfn(unsigned long mfn)
				236	{
				237	struct xen_memory_reservation reservation = {
				238	.address_bits = 0,
				239	.extent_order = 0,
				240	.domid = DOMID_SELF
				241	};
				242
				243	set_xen_guest_handle(reservation.extent_start, &mfn);
				244	reservation.nr_extents = 1;
				245
				246	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
				247	}
				248
				249	/*
				250	* This releases a chunk of memory and then does the identity map. It's used
				251	* as a fallback if the remapping fails.
				252	*/
				253	static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
				254	unsigned long end_pfn, unsigned long nr_pages)
				255	{
				256	unsigned long pfn, end;
				257	int ret;
				258
				259	WARN_ON(start_pfn > end_pfn);
				260
				261	/* Release pages first. */
				262	end = min(end_pfn, nr_pages);
				263	for (pfn = start_pfn; pfn < end; pfn++) {
				264	unsigned long mfn = pfn_to_mfn(pfn);
				265
				266	/* Make sure pfn exists to start with */
				267	if (mfn == INVALID_P2M_ENTRY \|\| mfn_to_pfn(mfn) != pfn)
				268	continue;
				269
				270	ret = xen_free_mfn(mfn);
				271	WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
				272
				273	if (ret == 1) {
				274	xen_released_pages++;
				275	if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				276	break;
				277	} else
				278	break;
				279	}
				280
				281	set_phys_range_identity(start_pfn, end_pfn);
				282	}
				283
				284	/*
				285	* Helper function to update the p2m and m2p tables and kernel mapping.
				286	*/
				287	static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
				288	{
				289	struct mmu_update update = {
				290	.ptr = ((uint64_t)mfn << PAGE_SHIFT) \| MMU_MACHPHYS_UPDATE,
				291	.val = pfn
				292	};
				293
				294	/* Update p2m */
				295	if (!set_phys_to_machine(pfn, mfn)) {
				296	WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
				297	pfn, mfn);
				298	BUG();
				299	}
				300
				301	/* Update m2p */
				302	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
				303	WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
				304	mfn, pfn);
				305	BUG();
				306	}
				307
				308	/* Update kernel mapping, but not for highmem. */
				309	if (pfn >= PFN_UP(__pa(high_memory - 1)))
				310	return;
				311
				312	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
				313	mfn_pte(mfn, PAGE_KERNEL), 0)) {
				314	WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
				315	mfn, pfn);
				316	BUG();
				317	}
				318	}
				319
				320	/*
				321	* This function updates the p2m and m2p tables with an identity map from
				322	* start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
				323	* original allocation at remap_pfn. The information needed for remapping is
				324	* saved in the memory itself to avoid the need for allocating buffers. The
				325	* complete remap information is contained in a list of MFNs each containing
				326	* up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
				327	* This enables us to preserve the original mfn sequence while doing the
				328	* remapping at a time when the memory management is capable of allocating
				329	* virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
				330	* its callers.
				331	*/
				332	static void __init xen_do_set_identity_and_remap_chunk(
				333	unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
				334	{
				335	unsigned long buf = (unsigned long)&xen_remap_buf;
				336	unsigned long mfn_save, mfn;
				337	unsigned long ident_pfn_iter, remap_pfn_iter;
				338	unsigned long ident_end_pfn = start_pfn + size;
				339	unsigned long left = size;
				340	unsigned int i, chunk;
				341
				342	WARN_ON(size == 0);
				343
				344	mfn_save = virt_to_mfn(buf);
				345
				346	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
				347	ident_pfn_iter < ident_end_pfn;
				348	ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
				349	chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
				350
				351	/* Map first pfn to xen_remap_buf */
				352	mfn = pfn_to_mfn(ident_pfn_iter);
				353	set_pte_mfn(buf, mfn, PAGE_KERNEL);
				354
				355	/* Save mapping information in page */
				356	xen_remap_buf.next_area_mfn = xen_remap_mfn;
				357	xen_remap_buf.target_pfn = remap_pfn_iter;
				358	xen_remap_buf.size = chunk;
				359	for (i = 0; i < chunk; i++)
				360	xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
				361
				362	/* Put remap buf into list. */
				363	xen_remap_mfn = mfn;
				364
				365	/* Set identity map */
				366	set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
				367
				368	left -= chunk;
				369	}
				370
				371	/* Restore old xen_remap_buf mapping */
				372	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
				373	}
				374
				375	/*
				376	* This function takes a contiguous pfn range that needs to be identity mapped
				377	* and:
				378	*
				379	* 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
				380	* 2) Calls the do_ function to actually do the mapping/remapping work.
				381	*
				382	* The goal is to not allocate additional memory but to remap the existing
				383	* pages. In the case of an error the underlying memory is simply released back
				384	* to Xen and not remapped.
				385	*/
				386	static unsigned long __init xen_set_identity_and_remap_chunk(
				387	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
				388	unsigned long remap_pfn)
				389	{
				390	unsigned long pfn;
				391	unsigned long i = 0;
				392	unsigned long n = end_pfn - start_pfn;
				393
				394	if (remap_pfn == 0)
				395	remap_pfn = nr_pages;
				396
				397	while (i < n) {
				398	unsigned long cur_pfn = start_pfn + i;
				399	unsigned long left = n - i;
				400	unsigned long size = left;
				401	unsigned long remap_range_size;
				402
				403	/* Do not remap pages beyond the current allocation */
				404	if (cur_pfn >= nr_pages) {
				405	/* Identity map remaining pages */
				406	set_phys_range_identity(cur_pfn, cur_pfn + size);
				407	break;
				408	}
				409	if (cur_pfn + size > nr_pages)
				410	size = nr_pages - cur_pfn;
				411
				412	remap_range_size = xen_find_pfn_range(&remap_pfn);
				413	if (!remap_range_size) {
				414	pr_warning("Unable to find available pfn range, not remapping identity pages\n");
				415	xen_set_identity_and_release_chunk(cur_pfn,
				416	cur_pfn + left, nr_pages);
				417	break;
				418	}
				419	/* Adjust size to fit in current e820 RAM region */
				420	if (size > remap_range_size)
				421	size = remap_range_size;
				422
				423	xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
				424
				425	/* Update variables to reflect new mappings. */
				426	i += size;
				427	remap_pfn += size;
				428	}
				429
				430	/*
				431	* If the PFNs are currently mapped, the VA mapping also needs
				432	* to be updated to be 1:1.
				433	*/
				434	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
				435	(void)HYPERVISOR_update_va_mapping(
				436	(unsigned long)__va(pfn << PAGE_SHIFT),
				437	mfn_pte(pfn, PAGE_KERNEL_IO), 0);
				438
				439	return remap_pfn;
				440	}
				441
				442	static unsigned long __init xen_count_remap_pages(
				443	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
				444	unsigned long remap_pages)
				445	{
				446	if (start_pfn >= nr_pages)
				447	return remap_pages;
				448
				449	return remap_pages + min(end_pfn, nr_pages) - start_pfn;
				450	}
				451
				452	static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
				453	unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
				454	unsigned long nr_pages, unsigned long last_val))
				455	{
				456	phys_addr_t start = 0;
				457	unsigned long ret_val = 0;
				458	const struct e820_entry *entry = xen_e820_table.entries;
				459	int i;
				460
				461	/*
				462	* Combine non-RAM regions and gaps until a RAM region (or the
				463	* end of the map) is reached, then call the provided function
				464	* to perform its duty on the non-RAM region.
				465	*
				466	* The combined non-RAM regions are rounded to a whole number
				467	* of pages so any partial pages are accessible via the 1:1
				468	* mapping. This is needed for some BIOSes that put (for
				469	* example) the DMI tables in a reserved region that begins on
				470	* a non-page boundary.
				471	*/
				472	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
				473	phys_addr_t end = entry->addr + entry->size;
				474	if (entry->type == E820_TYPE_RAM \|\| i == xen_e820_table.nr_entries - 1) {
				475	unsigned long start_pfn = PFN_DOWN(start);
				476	unsigned long end_pfn = PFN_UP(end);
				477
				478	if (entry->type == E820_TYPE_RAM)
				479	end_pfn = PFN_UP(entry->addr);
				480
				481	if (start_pfn < end_pfn)
				482	ret_val = func(start_pfn, end_pfn, nr_pages,
				483	ret_val);
				484	start = end;
				485	}
				486	}
				487
				488	return ret_val;
				489	}
				490
				491	/*
				492	* Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
				493	* The remap information (which mfn remap to which pfn) is contained in the
				494	* to be remapped memory itself in a linked list anchored at xen_remap_mfn.
				495	* This scheme allows to remap the different chunks in arbitrary order while
				496	* the resulting mapping will be independant from the order.
				497	*/
				498	void __init xen_remap_memory(void)
				499	{
				500	unsigned long buf = (unsigned long)&xen_remap_buf;
				501	unsigned long mfn_save, pfn;
				502	unsigned long remapped = 0;
				503	unsigned int i;
				504	unsigned long pfn_s = ~0UL;
				505	unsigned long len = 0;
				506
				507	mfn_save = virt_to_mfn(buf);
				508
				509	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
				510	/* Map the remap information */
				511	set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
				512
				513	BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
				514
				515	pfn = xen_remap_buf.target_pfn;
				516	for (i = 0; i < xen_remap_buf.size; i++) {
				517	xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
				518	remapped++;
				519	pfn++;
				520	}
				521	if (pfn_s == ~0UL \|\| pfn == pfn_s) {
				522	pfn_s = xen_remap_buf.target_pfn;
				523	len += xen_remap_buf.size;
				524	} else if (pfn_s + len == xen_remap_buf.target_pfn) {
				525	len += xen_remap_buf.size;
				526	} else {
				527	xen_del_extra_mem(pfn_s, len);
				528	pfn_s = xen_remap_buf.target_pfn;
				529	len = xen_remap_buf.size;
				530	}
				531	xen_remap_mfn = xen_remap_buf.next_area_mfn;
				532	}
				533
				534	if (pfn_s != ~0UL && len)
				535	xen_del_extra_mem(pfn_s, len);
				536
				537	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
				538
				539	pr_info("Remapped %ld page(s)\n", remapped);
				540	}
				541
				542	static unsigned long __init xen_get_pages_limit(void)
				543	{
				544	unsigned long limit;
				545
				546	#ifdef CONFIG_X86_32
				547	limit = GB(64) / PAGE_SIZE;
				548	#else
				549	limit = MAXMEM / PAGE_SIZE;
				550	if (!xen_initial_domain() && xen_512gb_limit)
				551	limit = GB(512) / PAGE_SIZE;
				552	#endif
				553	return limit;
				554	}
				555
				556	static unsigned long __init xen_get_max_pages(void)
				557	{
				558	unsigned long max_pages, limit;
				559	domid_t domid = DOMID_SELF;
				560	long ret;
				561
				562	limit = xen_get_pages_limit();
				563	max_pages = limit;
				564
				565	/*
				566	* For the initial domain we use the maximum reservation as
				567	* the maximum page.
				568	*
				569	* For guest domains the current maximum reservation reflects
				570	* the current maximum rather than the static maximum. In this
				571	* case the e820 map provided to us will cover the static
				572	* maximum region.
				573	*/
				574	if (xen_initial_domain()) {
				575	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
				576	if (ret > 0)
				577	max_pages = ret;
				578	}
				579
				580	return min(max_pages, limit);
				581	}
				582
				583	static void __init xen_align_and_add_e820_region(phys_addr_t start,
				584	phys_addr_t size, int type)
				585	{
				586	phys_addr_t end = start + size;
				587
				588	/* Align RAM regions to page boundaries. */
				589	if (type == E820_TYPE_RAM) {
				590	start = PAGE_ALIGN(start);
				591	end &= ~((phys_addr_t)PAGE_SIZE - 1);
				592	}
				593
				594	e820__range_add(start, end - start, type);
				595	}
				596
				597	static void __init xen_ignore_unusable(void)
				598	{
				599	struct e820_entry *entry = xen_e820_table.entries;
				600	unsigned int i;
				601
				602	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
				603	if (entry->type == E820_TYPE_UNUSABLE)
				604	entry->type = E820_TYPE_RAM;
				605	}
				606	}
				607
				608	bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
				609	{
				610	struct e820_entry *entry;
				611	unsigned mapcnt;
				612	phys_addr_t end;
				613
				614	if (!size)
				615	return false;
				616
				617	end = start + size;
				618	entry = xen_e820_table.entries;
				619
				620	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
				621	if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
				622	(entry->addr + entry->size) >= end)
				623	return false;
				624
				625	entry++;
				626	}
				627
				628	return true;
				629	}
				630
				631	/*
				632	* Find a free area in physical memory not yet reserved and compliant with
				633	* E820 map.
				634	* Used to relocate pre-allocated areas like initrd or p2m list which are in
				635	* conflict with the to be used E820 map.
				636	* In case no area is found, return 0. Otherwise return the physical address
				637	* of the area which is already reserved for convenience.
				638	*/
				639	phys_addr_t __init xen_find_free_area(phys_addr_t size)
				640	{
				641	unsigned mapcnt;
				642	phys_addr_t addr, start;
				643	struct e820_entry *entry = xen_e820_table.entries;
				644
				645	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
				646	if (entry->type != E820_TYPE_RAM \|\| entry->size < size)
				647	continue;
				648	start = entry->addr;
				649	for (addr = start; addr < start + size; addr += PAGE_SIZE) {
				650	if (!memblock_is_reserved(addr))
				651	continue;
				652	start = addr + PAGE_SIZE;
				653	if (start + size > entry->addr + entry->size)
				654	break;
				655	}
				656	if (addr >= start + size) {
				657	memblock_reserve(start, size);
				658	return start;
				659	}
				660	}
				661
				662	return 0;
				663	}
				664
				665	/*
				666	* Like memcpy, but with physical addresses for dest and src.
				667	*/
				668	static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
				669	phys_addr_t n)
				670	{
				671	phys_addr_t dest_off, src_off, dest_len, src_len, len;
				672	void from, to;
				673
				674	while (n) {
				675	dest_off = dest & ~PAGE_MASK;
				676	src_off = src & ~PAGE_MASK;
				677	dest_len = n;
				678	if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
				679	dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
				680	src_len = n;
				681	if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
				682	src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
				683	len = min(dest_len, src_len);
				684	to = early_memremap(dest - dest_off, dest_len + dest_off);
				685	from = early_memremap(src - src_off, src_len + src_off);
				686	memcpy(to, from, len);
				687	early_memunmap(to, dest_len + dest_off);
				688	early_memunmap(from, src_len + src_off);
				689	n -= len;
				690	dest += len;
				691	src += len;
				692	}
				693	}
				694
				695	/*
				696	* Reserve Xen mfn_list.
				697	*/
				698	static void __init xen_reserve_xen_mfnlist(void)
				699	{
				700	phys_addr_t start, size;
				701
				702	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
				703	start = __pa(xen_start_info->mfn_list);
				704	size = PFN_ALIGN(xen_start_info->nr_pages *
				705	sizeof(unsigned long));
				706	} else {
				707	start = PFN_PHYS(xen_start_info->first_p2m_pfn);
				708	size = PFN_PHYS(xen_start_info->nr_p2m_frames);
				709	}
				710
				711	memblock_reserve(start, size);
				712	if (!xen_is_e820_reserved(start, size))
				713	return;
				714
				715	#ifdef CONFIG_X86_32
				716	/*
				717	* Relocating the p2m on 32 bit system to an arbitrary virtual address
				718	* is not supported, so just give up.
				719	*/
				720	xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
				721	BUG();
				722	#else
				723	xen_relocate_p2m();
				724	memblock_free(start, size);
				725	#endif
				726	}
				727
				728	/**
				729	* machine_specific_memory_setup - Hook for machine specific memory setup.
				730	**/
				731	char * __init xen_memory_setup(void)
				732	{
				733	unsigned long max_pfn, pfn_s, n_pfns;
				734	phys_addr_t mem_end, addr, size, chunk_size;
				735	u32 type;
				736	int rc;
				737	struct xen_memory_map memmap;
				738	unsigned long max_pages;
				739	unsigned long extra_pages = 0;
				740	int i;
				741	int op;
				742
				743	xen_parse_512gb();
				744	max_pfn = xen_get_pages_limit();
				745	max_pfn = min(max_pfn, xen_start_info->nr_pages);
				746	mem_end = PFN_PHYS(max_pfn);
				747
				748	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
				749	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
				750
				751	op = xen_initial_domain() ?
				752	XENMEM_machine_memory_map :
				753	XENMEM_memory_map;
				754	rc = HYPERVISOR_memory_op(op, &memmap);
				755	if (rc == -ENOSYS) {
				756	BUG_ON(xen_initial_domain());
				757	memmap.nr_entries = 1;
				758	xen_e820_table.entries[0].addr = 0ULL;
				759	xen_e820_table.entries[0].size = mem_end;
				760	/* 8MB slack (to balance backend allocations). */
				761	xen_e820_table.entries[0].size += 8ULL << 20;
				762	xen_e820_table.entries[0].type = E820_TYPE_RAM;
				763	rc = 0;
				764	}
				765	BUG_ON(rc);
				766	BUG_ON(memmap.nr_entries == 0);
				767	xen_e820_table.nr_entries = memmap.nr_entries;
				768
				769	/*
				770	* Xen won't allow a 1:1 mapping to be created to UNUSABLE
				771	* regions, so if we're using the machine memory map leave the
				772	* region as RAM as it is in the pseudo-physical map.
				773	*
				774	* UNUSABLE regions in domUs are not handled and will need
				775	* a patch in the future.
				776	*/
				777	if (xen_initial_domain())
				778	xen_ignore_unusable();
				779
				780	/* Make sure the Xen-supplied memory map is well-ordered. */
				781	e820__update_table(&xen_e820_table);
				782
				783	max_pages = xen_get_max_pages();
				784
				785	/* How many extra pages do we need due to remapping? */
				786	max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
				787
				788	if (max_pages > max_pfn)
				789	extra_pages += max_pages - max_pfn;
				790
				791	/*
				792	* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
				793	* factor the base size. On non-highmem systems, the base
				794	* size is the full initial memory allocation; on highmem it
				795	* is limited to the max size of lowmem, so that it doesn't
				796	* get completely filled.
				797	*
				798	* Make sure we have no memory above max_pages, as this area
				799	* isn't handled by the p2m management.
				800	*
				801	* In principle there could be a problem in lowmem systems if
				802	* the initial memory is also very large with respect to
				803	* lowmem, but we won't try to deal with that here.
				804	*/
				805	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
				806	extra_pages, max_pages - max_pfn);
				807	i = 0;
				808	addr = xen_e820_table.entries[0].addr;
				809	size = xen_e820_table.entries[0].size;
				810	while (i < xen_e820_table.nr_entries) {
				811	bool discard = false;
				812
				813	chunk_size = size;
				814	type = xen_e820_table.entries[i].type;
				815
				816	if (type == E820_TYPE_RAM) {
				817	if (addr < mem_end) {
				818	chunk_size = min(size, mem_end - addr);
				819	} else if (extra_pages) {
				820	chunk_size = min(size, PFN_PHYS(extra_pages));
				821	pfn_s = PFN_UP(addr);
				822	n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
				823	extra_pages -= n_pfns;
				824	xen_add_extra_mem(pfn_s, n_pfns);
				825	xen_max_p2m_pfn = pfn_s + n_pfns;
				826	} else
				827	discard = true;
				828	}
				829
				830	if (!discard)
				831	xen_align_and_add_e820_region(addr, chunk_size, type);
				832
				833	addr += chunk_size;
				834	size -= chunk_size;
				835	if (size == 0) {
				836	i++;
				837	if (i < xen_e820_table.nr_entries) {
				838	addr = xen_e820_table.entries[i].addr;
				839	size = xen_e820_table.entries[i].size;
				840	}
				841	}
				842	}
				843
				844	/*
				845	* Set the rest as identity mapped, in case PCI BARs are
				846	* located here.
				847	*/
				848	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
				849
				850	/*
				851	* In domU, the ISA region is normal, usable memory, but we
				852	* reserve ISA memory anyway because too many things poke
				853	* about in there.
				854	*/
				855	e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);
				856
				857	e820__update_table(e820_table);
				858
				859	/*
				860	* Check whether the kernel itself conflicts with the target E820 map.
				861	* Failing now is better than running into weird problems later due
				862	* to relocating (and even reusing) pages with kernel text or data.
				863	*/
				864	if (xen_is_e820_reserved(__pa_symbol(_text),
				865	__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
				866	xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
				867	BUG();
				868	}
				869
				870	/*
				871	* Check for a conflict of the hypervisor supplied page tables with
				872	* the target E820 map.
				873	*/
				874	xen_pt_check_e820();
				875
				876	xen_reserve_xen_mfnlist();
				877
				878	/* Check for a conflict of the initrd with the target E820 map. */
				879	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
				880	boot_params.hdr.ramdisk_size)) {
				881	phys_addr_t new_area, start, size;
				882
				883	new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
				884	if (!new_area) {
				885	xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
				886	BUG();
				887	}
				888
				889	start = boot_params.hdr.ramdisk_image;
				890	size = boot_params.hdr.ramdisk_size;
				891	xen_phys_memcpy(new_area, start, size);
				892	pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
				893	start, start + size, new_area, new_area + size);
				894	memblock_free(start, size);
				895	boot_params.hdr.ramdisk_image = new_area;
				896	boot_params.ext_ramdisk_image = new_area >> 32;
				897	}
				898
				899	/*
				900	* Set identity map on non-RAM pages and prepare remapping the
				901	* underlying RAM.
				902	*/
				903	xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
				904
				905	pr_info("Released %ld page(s)\n", xen_released_pages);
				906
				907	return "Xen";
				908	}
				909
				910	/*
				911	* Set the bit indicating "nosegneg" library variants should be used.
				912	* We only need to bother in pure 32-bit mode; compat 32-bit processes
				913	* can have un-truncated segments, so wrapping around is allowed.
				914	*/
				915	static void __init fiddle_vdso(void)
				916	{
				917	#ifdef CONFIG_X86_32
				918	u32 *mask = vdso_image_32.data +
				919	vdso_image_32.sym_VDSO32_NOTE_MASK;
				920	*mask \|= 1 << VDSO_NOTE_NONEGSEG_BIT;
				921	#endif
				922	}
				923
				924	static int register_callback(unsigned type, const void *func)
				925	{
				926	struct callback_register callback = {
				927	.type = type,
				928	.address = XEN_CALLBACK(__KERNEL_CS, func),
				929	.flags = CALLBACKF_mask_events,
				930	};
				931
				932	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
				933	}
				934
				935	void xen_enable_sysenter(void)
				936	{
				937	int ret;
				938	unsigned sysenter_feature;
				939
				940	#ifdef CONFIG_X86_32
				941	sysenter_feature = X86_FEATURE_SEP;
				942	#else
				943	sysenter_feature = X86_FEATURE_SYSENTER32;
				944	#endif
				945
				946	if (!boot_cpu_has(sysenter_feature))
				947	return;
				948
				949	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
				950	if(ret != 0)
				951	setup_clear_cpu_cap(sysenter_feature);
				952	}
				953
				954	void xen_enable_syscall(void)
				955	{
				956	#ifdef CONFIG_X86_64
				957	int ret;
				958
				959	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
				960	if (ret != 0) {
				961	printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
				962	/* Pretty fatal; 64-bit userspace has no other
				963	mechanism for syscalls. */
				964	}
				965
				966	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
				967	ret = register_callback(CALLBACKTYPE_syscall32,
				968	xen_syscall32_target);
				969	if (ret != 0)
				970	setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
				971	}
				972	#endif /* CONFIG_X86_64 */
				973	}
				974
				975	void __init xen_pvmmu_arch_setup(void)
				976	{
				977	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
				978	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
				979
				980	HYPERVISOR_vm_assist(VMASST_CMD_enable,
				981	VMASST_TYPE_pae_extended_cr3);
				982
				983	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) \|\|
				984	register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
				985	BUG();
				986
				987	xen_enable_sysenter();
				988	xen_enable_syscall();
				989	}
				990
				991	/* This function is not called for HVM domains */
				992	void __init xen_arch_setup(void)
				993	{
				994	xen_panic_handler_init();
				995	xen_pvmmu_arch_setup();
				996
				997	#ifdef CONFIG_ACPI
				998	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
				999	printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
				1000	disable_acpi();
				1001	}
				1002	#endif
				1003
				1004	memcpy(boot_command_line, xen_start_info->cmd_line,
				1005	MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
				1006	COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
				1007
				1008	/* Set up idle, making sure it calls safe_halt() pvop */
				1009	disable_cpuidle();
				1010	disable_cpufreq();
				1011	WARN_ON(xen_set_default_idle());
				1012	fiddle_vdso();
				1013	#ifdef CONFIG_NUMA
				1014	numa_off = 1;
				1015	#endif
				1016	}