Blame - src/kernel/linux/v4.19/drivers/misc/vmw_balloon.c - T800

blob: 2543ef1ece179bc5ba2430e8cf0b65f7ee612645 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* VMware Balloon driver.
				4	*
				5	* Copyright (C) 2000-2018, VMware, Inc. All Rights Reserved.
				6	*
				7	* This is VMware physical memory management driver for Linux. The driver
				8	* acts like a "balloon" that can be inflated to reclaim physical pages by
				9	* reserving them in the guest and invalidating them in the monitor,
				10	* freeing up the underlying machine pages so they can be allocated to
				11	* other guests. The balloon can also be deflated to allow the guest to
				12	* use more physical memory. Higher level policies can control the sizes
				13	* of balloons in VMs in order to manage physical memory resources.
				14	*/
				15
				16	//#define DEBUG
				17	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				18
				19	#include <linux/types.h>
				20	#include <linux/kernel.h>
				21	#include <linux/mm.h>
				22	#include <linux/vmalloc.h>
				23	#include <linux/sched.h>
				24	#include <linux/module.h>
				25	#include <linux/workqueue.h>
				26	#include <linux/debugfs.h>
				27	#include <linux/seq_file.h>
				28	#include <linux/vmw_vmci_defs.h>
				29	#include <linux/vmw_vmci_api.h>
				30	#include <asm/hypervisor.h>
				31
				32	MODULE_AUTHOR("VMware, Inc.");
				33	MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
				34	MODULE_VERSION("1.5.0.0-k");
				35	MODULE_ALIAS("dmi::svnVMware:*");
				36	MODULE_ALIAS("vmware_vmmemctl");
				37	MODULE_LICENSE("GPL");
				38
				39	/*
				40	* Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
				41	* allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
				42	* __GFP_NOWARN, to suppress page allocation failure warnings.
				43	*/
				44	#define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM\|__GFP_NOWARN)
				45
				46	/*
				47	* Use GFP_HIGHUSER when executing in a separate kernel thread
				48	* context and allocation can sleep. This is less stressful to
				49	* the guest memory system, since it allows the thread to block
				50	* while memory is reclaimed, and won't take pages from emergency
				51	* low-memory pools.
				52	*/
				53	#define VMW_PAGE_ALLOC_CANSLEEP (GFP_HIGHUSER)
				54
				55	/* Maximum number of refused pages we accumulate during inflation cycle */
				56	#define VMW_BALLOON_MAX_REFUSED 16
				57
				58	/*
				59	* Hypervisor communication port definitions.
				60	*/
				61	#define VMW_BALLOON_HV_PORT 0x5670
				62	#define VMW_BALLOON_HV_MAGIC 0x456c6d6f
				63	#define VMW_BALLOON_GUEST_ID 1 /* Linux */
				64
				65	enum vmwballoon_capabilities {
				66	/*
				67	* Bit 0 is reserved and not associated to any capability.
				68	*/
				69	VMW_BALLOON_BASIC_CMDS = (1 << 1),
				70	VMW_BALLOON_BATCHED_CMDS = (1 << 2),
				71	VMW_BALLOON_BATCHED_2M_CMDS = (1 << 3),
				72	VMW_BALLOON_SIGNALLED_WAKEUP_CMD = (1 << 4),
				73	};
				74
				75	#define VMW_BALLOON_CAPABILITIES (VMW_BALLOON_BASIC_CMDS \
				76	\| VMW_BALLOON_BATCHED_CMDS \
				77	\| VMW_BALLOON_BATCHED_2M_CMDS \
				78	\| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
				79
				80	#define VMW_BALLOON_2M_SHIFT (9)
				81	#define VMW_BALLOON_NUM_PAGE_SIZES (2)
				82
				83	/*
				84	* Backdoor commands availability:
				85	*
				86	* START, GET_TARGET and GUEST_ID are always available,
				87	*
				88	* VMW_BALLOON_BASIC_CMDS:
				89	* LOCK and UNLOCK commands,
				90	* VMW_BALLOON_BATCHED_CMDS:
				91	* BATCHED_LOCK and BATCHED_UNLOCK commands.
				92	* VMW BALLOON_BATCHED_2M_CMDS:
				93	* BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
				94	* VMW VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
				95	* VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
				96	*/
				97	#define VMW_BALLOON_CMD_START 0
				98	#define VMW_BALLOON_CMD_GET_TARGET 1
				99	#define VMW_BALLOON_CMD_LOCK 2
				100	#define VMW_BALLOON_CMD_UNLOCK 3
				101	#define VMW_BALLOON_CMD_GUEST_ID 4
				102	#define VMW_BALLOON_CMD_BATCHED_LOCK 6
				103	#define VMW_BALLOON_CMD_BATCHED_UNLOCK 7
				104	#define VMW_BALLOON_CMD_BATCHED_2M_LOCK 8
				105	#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK 9
				106	#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET 10
				107
				108
				109	/* error codes */
				110	#define VMW_BALLOON_SUCCESS 0
				111	#define VMW_BALLOON_FAILURE -1
				112	#define VMW_BALLOON_ERROR_CMD_INVALID 1
				113	#define VMW_BALLOON_ERROR_PPN_INVALID 2
				114	#define VMW_BALLOON_ERROR_PPN_LOCKED 3
				115	#define VMW_BALLOON_ERROR_PPN_UNLOCKED 4
				116	#define VMW_BALLOON_ERROR_PPN_PINNED 5
				117	#define VMW_BALLOON_ERROR_PPN_NOTNEEDED 6
				118	#define VMW_BALLOON_ERROR_RESET 7
				119	#define VMW_BALLOON_ERROR_BUSY 8
				120
				121	#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES (0x03000000)
				122
				123	/* Batch page description */
				124
				125	/*
				126	* Layout of a page in the batch page:
				127	*
				128	* +-------------+----------+--------+
				129	* \| \| \| \|
				130	* \| Page number \| Reserved \| Status \|
				131	* \| \| \| \|
				132	* +-------------+----------+--------+
				133	* 64 PAGE_SHIFT 6 0
				134	*
				135	* The reserved field should be set to 0.
				136	*/
				137	#define VMW_BALLOON_BATCH_MAX_PAGES (PAGE_SIZE / sizeof(u64))
				138	#define VMW_BALLOON_BATCH_STATUS_MASK ((1UL << 5) - 1)
				139	#define VMW_BALLOON_BATCH_PAGE_MASK (~((1UL << PAGE_SHIFT) - 1))
				140
				141	struct vmballoon_batch_page {
				142	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
				143	};
				144
				145	static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
				146	{
				147	return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
				148	}
				149
				150	static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
				151	int idx)
				152	{
				153	return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
				154	}
				155
				156	static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
				157	u64 pa)
				158	{
				159	batch->pages[idx] = pa;
				160	}
				161
				162
				163	#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result) \
				164	({ \
				165	unsigned long __status, __dummy1, __dummy2, __dummy3; \
				166	__asm__ __volatile__ ("inl %%dx" : \
				167	"=a"(__status), \
				168	"=c"(__dummy1), \
				169	"=d"(__dummy2), \
				170	"=b"(result), \
				171	"=S" (__dummy3) : \
				172	"0"(VMW_BALLOON_HV_MAGIC), \
				173	"1"(VMW_BALLOON_CMD_##cmd), \
				174	"2"(VMW_BALLOON_HV_PORT), \
				175	"3"(arg1), \
				176	"4" (arg2) : \
				177	"memory"); \
				178	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START) \
				179	result = __dummy1; \
				180	result &= -1UL; \
				181	__status & -1UL; \
				182	})
				183
				184	#ifdef CONFIG_DEBUG_FS
				185	struct vmballoon_stats {
				186	unsigned int timer;
				187	unsigned int doorbell;
				188
				189	/* allocation statistics */
				190	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
				191	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
				192	unsigned int sleep_alloc;
				193	unsigned int sleep_alloc_fail;
				194	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
				195	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
				196	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];
				197
				198	/* monitor operations */
				199	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
				200	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
				201	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
				202	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
				203	unsigned int target;
				204	unsigned int target_fail;
				205	unsigned int start;
				206	unsigned int start_fail;
				207	unsigned int guest_type;
				208	unsigned int guest_type_fail;
				209	unsigned int doorbell_set;
				210	unsigned int doorbell_unset;
				211	};
				212
				213	#define STATS_INC(stat) (stat)++
				214	#else
				215	#define STATS_INC(stat)
				216	#endif
				217
				218	struct vmballoon;
				219
				220	struct vmballoon_ops {
				221	void (add_page)(struct vmballoon b, int idx, struct page *p);
				222	int (lock)(struct vmballoon b, unsigned int num_pages,
				223	bool is_2m_pages, unsigned int *target);
				224	int (unlock)(struct vmballoon b, unsigned int num_pages,
				225	bool is_2m_pages, unsigned int *target);
				226	};
				227
				228	struct vmballoon_page_size {
				229	/* list of reserved physical pages */
				230	struct list_head pages;
				231
				232	/* transient list of non-balloonable pages */
				233	struct list_head refused_pages;
				234	unsigned int n_refused_pages;
				235	};
				236
				237	struct vmballoon {
				238	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];
				239
				240	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
				241	unsigned supported_page_sizes;
				242
				243	/* balloon size in pages */
				244	unsigned int size;
				245	unsigned int target;
				246
				247	/* reset flag */
				248	bool reset_required;
				249
				250	unsigned long capabilities;
				251
				252	struct vmballoon_batch_page *batch_page;
				253	unsigned int batch_max_pages;
				254	struct page *page;
				255
				256	const struct vmballoon_ops *ops;
				257
				258	#ifdef CONFIG_DEBUG_FS
				259	/* statistics */
				260	struct vmballoon_stats stats;
				261
				262	/* debugfs file exporting statistics */
				263	struct dentry *dbg_entry;
				264	#endif
				265
				266	struct sysinfo sysinfo;
				267
				268	struct delayed_work dwork;
				269
				270	struct vmci_handle vmci_doorbell;
				271	};
				272
				273	static struct vmballoon balloon;
				274
				275	/*
				276	* Send "start" command to the host, communicating supported version
				277	* of the protocol.
				278	*/
				279	static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
				280	{
				281	unsigned long status, capabilities, dummy = 0;
				282	bool success;
				283
				284	STATS_INC(b->stats.start);
				285
				286	status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
				287
				288	switch (status) {
				289	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
				290	b->capabilities = capabilities;
				291	success = true;
				292	break;
				293	case VMW_BALLOON_SUCCESS:
				294	b->capabilities = VMW_BALLOON_BASIC_CMDS;
				295	success = true;
				296	break;
				297	default:
				298	success = false;
				299	}
				300
				301	/*
				302	* 2MB pages are only supported with batching. If batching is for some
				303	* reason disabled, do not use 2MB pages, since otherwise the legacy
				304	* mechanism is used with 2MB pages, causing a failure.
				305	*/
				306	if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
				307	(b->capabilities & VMW_BALLOON_BATCHED_CMDS))
				308	b->supported_page_sizes = 2;
				309	else
				310	b->supported_page_sizes = 1;
				311
				312	if (!success) {
				313	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
				314	STATS_INC(b->stats.start_fail);
				315	}
				316	return success;
				317	}
				318
				319	static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
				320	{
				321	switch (status) {
				322	case VMW_BALLOON_SUCCESS:
				323	return true;
				324
				325	case VMW_BALLOON_ERROR_RESET:
				326	b->reset_required = true;
				327	/* fall through */
				328
				329	default:
				330	return false;
				331	}
				332	}
				333
				334	/*
				335	* Communicate guest type to the host so that it can adjust ballooning
				336	* algorithm to the one most appropriate for the guest. This command
				337	* is normally issued after sending "start" command and is part of
				338	* standard reset sequence.
				339	*/
				340	static bool vmballoon_send_guest_id(struct vmballoon *b)
				341	{
				342	unsigned long status, dummy = 0;
				343
				344	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
				345	dummy);
				346
				347	STATS_INC(b->stats.guest_type);
				348
				349	if (vmballoon_check_status(b, status))
				350	return true;
				351
				352	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
				353	STATS_INC(b->stats.guest_type_fail);
				354	return false;
				355	}
				356
				357	static u16 vmballoon_page_size(bool is_2m_page)
				358	{
				359	if (is_2m_page)
				360	return 1 << VMW_BALLOON_2M_SHIFT;
				361
				362	return 1;
				363	}
				364
				365	/*
				366	* Retrieve desired balloon size from the host.
				367	*/
				368	static bool vmballoon_send_get_target(struct vmballoon b, u32 new_target)
				369	{
				370	unsigned long status;
				371	unsigned long target;
				372	unsigned long limit;
				373	unsigned long dummy = 0;
				374	u32 limit32;
				375
				376	/*
				377	* si_meminfo() is cheap. Moreover, we want to provide dynamic
				378	* max balloon size later. So let us call si_meminfo() every
				379	* iteration.
				380	*/
				381	si_meminfo(&b->sysinfo);
				382	limit = b->sysinfo.totalram;
				383
				384	/* Ensure limit fits in 32-bits */
				385	limit32 = (u32)limit;
				386	if (limit != limit32)
				387	return false;
				388
				389	/* update stats */
				390	STATS_INC(b->stats.target);
				391
				392	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
				393	if (vmballoon_check_status(b, status)) {
				394	*new_target = target;
				395	return true;
				396	}
				397
				398	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
				399	STATS_INC(b->stats.target_fail);
				400	return false;
				401	}
				402
				403	/*
				404	* Notify the host about allocated page so that host can use it without
				405	* fear that guest will need it. Host may reject some pages, we need to
				406	* check the return value and maybe submit a different page.
				407	*/
				408	static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
				409	unsigned int hv_status, unsigned int target)
				410	{
				411	unsigned long status, dummy = 0;
				412	u32 pfn32;
				413
				414	pfn32 = (u32)pfn;
				415	if (pfn32 != pfn)
				416	return -EINVAL;
				417
				418	STATS_INC(b->stats.lock[false]);
				419
				420	hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, target);
				421	if (vmballoon_check_status(b, status))
				422	return 0;
				423
				424	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
				425	STATS_INC(b->stats.lock_fail[false]);
				426	return -EIO;
				427	}
				428
				429	static int vmballoon_send_batched_lock(struct vmballoon *b,
				430	unsigned int num_pages, bool is_2m_pages, unsigned int *target)
				431	{
				432	unsigned long status;
				433	unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
				434
				435	STATS_INC(b->stats.lock[is_2m_pages]);
				436
				437	if (is_2m_pages)
				438	status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
				439	*target);
				440	else
				441	status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
				442	*target);
				443
				444	if (vmballoon_check_status(b, status))
				445	return 0;
				446
				447	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
				448	STATS_INC(b->stats.lock_fail[is_2m_pages]);
				449	return 1;
				450	}
				451
				452	/*
				453	* Notify the host that guest intends to release given page back into
				454	* the pool of available (to the guest) pages.
				455	*/
				456	static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
				457	unsigned int *target)
				458	{
				459	unsigned long status, dummy = 0;
				460	u32 pfn32;
				461
				462	pfn32 = (u32)pfn;
				463	if (pfn32 != pfn)
				464	return false;
				465
				466	STATS_INC(b->stats.unlock[false]);
				467
				468	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
				469	if (vmballoon_check_status(b, status))
				470	return true;
				471
				472	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
				473	STATS_INC(b->stats.unlock_fail[false]);
				474	return false;
				475	}
				476
				477	static bool vmballoon_send_batched_unlock(struct vmballoon *b,
				478	unsigned int num_pages, bool is_2m_pages, unsigned int *target)
				479	{
				480	unsigned long status;
				481	unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
				482
				483	STATS_INC(b->stats.unlock[is_2m_pages]);
				484
				485	if (is_2m_pages)
				486	status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
				487	*target);
				488	else
				489	status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
				490	*target);
				491
				492	if (vmballoon_check_status(b, status))
				493	return true;
				494
				495	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
				496	STATS_INC(b->stats.unlock_fail[is_2m_pages]);
				497	return false;
				498	}
				499
				500	static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
				501	{
				502	if (is_2m_page)
				503	return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
				504
				505	return alloc_page(flags);
				506	}
				507
				508	static void vmballoon_free_page(struct page *page, bool is_2m_page)
				509	{
				510	if (is_2m_page)
				511	__free_pages(page, VMW_BALLOON_2M_SHIFT);
				512	else
				513	__free_page(page);
				514	}
				515
				516	/*
				517	* Quickly release all pages allocated for the balloon. This function is
				518	* called when host decides to "reset" balloon for one reason or another.
				519	* Unlike normal "deflate" we do not (shall not) notify host of the pages
				520	* being released.
				521	*/
				522	static void vmballoon_pop(struct vmballoon *b)
				523	{
				524	struct page page, next;
				525	unsigned is_2m_pages;
				526
				527	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
				528	is_2m_pages++) {
				529	struct vmballoon_page_size *page_size =
				530	&b->page_sizes[is_2m_pages];
				531	u16 size_per_page = vmballoon_page_size(is_2m_pages);
				532
				533	list_for_each_entry_safe(page, next, &page_size->pages, lru) {
				534	list_del(&page->lru);
				535	vmballoon_free_page(page, is_2m_pages);
				536	STATS_INC(b->stats.free[is_2m_pages]);
				537	b->size -= size_per_page;
				538	cond_resched();
				539	}
				540	}
				541
				542	/* Clearing the batch_page unconditionally has no adverse effect */
				543	free_page((unsigned long)b->batch_page);
				544	b->batch_page = NULL;
				545	}
				546
				547	/*
				548	* Notify the host of a ballooned page. If host rejects the page put it on the
				549	* refuse list, those refused page are then released at the end of the
				550	* inflation cycle.
				551	*/
				552	static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
				553	bool is_2m_pages, unsigned int *target)
				554	{
				555	int locked, hv_status;
				556	struct page *page = b->page;
				557	struct vmballoon_page_size *page_size = &b->page_sizes[false];
				558
				559	/* is_2m_pages can never happen as 2m pages support implies batching */
				560
				561	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
				562	target);
				563	if (locked) {
				564	STATS_INC(b->stats.refused_alloc[false]);
				565
				566	if (locked == -EIO &&
				567	(hv_status == VMW_BALLOON_ERROR_RESET \|\|
				568	hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) {
				569	vmballoon_free_page(page, false);
				570	return -EIO;
				571	}
				572
				573	/*
				574	* Place page on the list of non-balloonable pages
				575	* and retry allocation, unless we already accumulated
				576	* too many of them, in which case take a breather.
				577	*/
				578	if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
				579	page_size->n_refused_pages++;
				580	list_add(&page->lru, &page_size->refused_pages);
				581	} else {
				582	vmballoon_free_page(page, false);
				583	}
				584	return locked;
				585	}
				586
				587	/* track allocated page */
				588	list_add(&page->lru, &page_size->pages);
				589
				590	/* update balloon size */
				591	b->size++;
				592
				593	return 0;
				594	}
				595
				596	static int vmballoon_lock_batched_page(struct vmballoon *b,
				597	unsigned int num_pages, bool is_2m_pages, unsigned int *target)
				598	{
				599	int locked, i;
				600	u16 size_per_page = vmballoon_page_size(is_2m_pages);
				601
				602	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
				603	target);
				604	if (locked > 0) {
				605	for (i = 0; i < num_pages; i++) {
				606	u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
				607	struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
				608
				609	vmballoon_free_page(p, is_2m_pages);
				610	}
				611
				612	return -EIO;
				613	}
				614
				615	for (i = 0; i < num_pages; i++) {
				616	u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
				617	struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
				618	struct vmballoon_page_size *page_size =
				619	&b->page_sizes[is_2m_pages];
				620
				621	locked = vmballoon_batch_get_status(b->batch_page, i);
				622
				623	switch (locked) {
				624	case VMW_BALLOON_SUCCESS:
				625	list_add(&p->lru, &page_size->pages);
				626	b->size += size_per_page;
				627	break;
				628	case VMW_BALLOON_ERROR_PPN_PINNED:
				629	case VMW_BALLOON_ERROR_PPN_INVALID:
				630	if (page_size->n_refused_pages
				631	< VMW_BALLOON_MAX_REFUSED) {
				632	list_add(&p->lru, &page_size->refused_pages);
				633	page_size->n_refused_pages++;
				634	break;
				635	}
				636	/* Fallthrough */
				637	case VMW_BALLOON_ERROR_RESET:
				638	case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
				639	vmballoon_free_page(p, is_2m_pages);
				640	break;
				641	default:
				642	/* This should never happen */
				643	WARN_ON_ONCE(true);
				644	}
				645	}
				646
				647	return 0;
				648	}
				649
				650	/*
				651	* Release the page allocated for the balloon. Note that we first notify
				652	* the host so it can make sure the page will be available for the guest
				653	* to use, if needed.
				654	*/
				655	static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
				656	bool is_2m_pages, unsigned int *target)
				657	{
				658	struct page *page = b->page;
				659	struct vmballoon_page_size *page_size = &b->page_sizes[false];
				660
				661	/* is_2m_pages can never happen as 2m pages support implies batching */
				662
				663	if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
				664	list_add(&page->lru, &page_size->pages);
				665	return -EIO;
				666	}
				667
				668	/* deallocate page */
				669	vmballoon_free_page(page, false);
				670	STATS_INC(b->stats.free[false]);
				671
				672	/* update balloon size */
				673	b->size--;
				674
				675	return 0;
				676	}
				677
				678	static int vmballoon_unlock_batched_page(struct vmballoon *b,
				679	unsigned int num_pages, bool is_2m_pages,
				680	unsigned int *target)
				681	{
				682	int locked, i, ret = 0;
				683	bool hv_success;
				684	u16 size_per_page = vmballoon_page_size(is_2m_pages);
				685
				686	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
				687	target);
				688	if (!hv_success)
				689	ret = -EIO;
				690
				691	for (i = 0; i < num_pages; i++) {
				692	u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
				693	struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
				694	struct vmballoon_page_size *page_size =
				695	&b->page_sizes[is_2m_pages];
				696
				697	locked = vmballoon_batch_get_status(b->batch_page, i);
				698	if (!hv_success \|\| locked != VMW_BALLOON_SUCCESS) {
				699	/*
				700	* That page wasn't successfully unlocked by the
				701	* hypervisor, re-add it to the list of pages owned by
				702	* the balloon driver.
				703	*/
				704	list_add(&p->lru, &page_size->pages);
				705	} else {
				706	/* deallocate page */
				707	vmballoon_free_page(p, is_2m_pages);
				708	STATS_INC(b->stats.free[is_2m_pages]);
				709
				710	/* update balloon size */
				711	b->size -= size_per_page;
				712	}
				713	}
				714
				715	return ret;
				716	}
				717
				718	/*
				719	* Release pages that were allocated while attempting to inflate the
				720	* balloon but were refused by the host for one reason or another.
				721	*/
				722	static void vmballoon_release_refused_pages(struct vmballoon *b,
				723	bool is_2m_pages)
				724	{
				725	struct page page, next;
				726	struct vmballoon_page_size *page_size =
				727	&b->page_sizes[is_2m_pages];
				728
				729	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
				730	list_del(&page->lru);
				731	vmballoon_free_page(page, is_2m_pages);
				732	STATS_INC(b->stats.refused_free[is_2m_pages]);
				733	}
				734
				735	page_size->n_refused_pages = 0;
				736	}
				737
				738	static void vmballoon_add_page(struct vmballoon b, int idx, struct page p)
				739	{
				740	b->page = p;
				741	}
				742
				743	static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
				744	struct page *p)
				745	{
				746	vmballoon_batch_set_pa(b->batch_page, idx,
				747	(u64)page_to_pfn(p) << PAGE_SHIFT);
				748	}
				749
				750	/*
				751	* Inflate the balloon towards its target size. Note that we try to limit
				752	* the rate of allocation to make sure we are not choking the rest of the
				753	* system.
				754	*/
				755	static void vmballoon_inflate(struct vmballoon *b)
				756	{
				757	unsigned int num_pages = 0;
				758	int error = 0;
				759	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
				760	bool is_2m_pages;
				761
				762	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
				763
				764	/*
				765	* First try NOSLEEP page allocations to inflate balloon.
				766	*
				767	* If we do not throttle nosleep allocations, we can drain all
				768	* free pages in the guest quickly (if the balloon target is high).
				769	* As a side-effect, draining free pages helps to inform (force)
				770	* the guest to start swapping if balloon target is not met yet,
				771	* which is a desired behavior. However, balloon driver can consume
				772	* all available CPU cycles if too many pages are allocated in a
				773	* second. Therefore, we throttle nosleep allocations even when
				774	* the guest is not under memory pressure. OTOH, if we have already
				775	* predicted that the guest is under memory pressure, then we
				776	* slowdown page allocations considerably.
				777	*/
				778
				779	/*
				780	* Start with no sleep allocation rate which may be higher
				781	* than sleeping allocation rate.
				782	*/
				783	is_2m_pages = b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
				784
				785	pr_debug("%s - goal: %d", __func__, b->target - b->size);
				786
				787	while (!b->reset_required &&
				788	b->size + num_pages * vmballoon_page_size(is_2m_pages)
				789	< b->target) {
				790	struct page *page;
				791
				792	if (flags == VMW_PAGE_ALLOC_NOSLEEP)
				793	STATS_INC(b->stats.alloc[is_2m_pages]);
				794	else
				795	STATS_INC(b->stats.sleep_alloc);
				796
				797	page = vmballoon_alloc_page(flags, is_2m_pages);
				798	if (!page) {
				799	STATS_INC(b->stats.alloc_fail[is_2m_pages]);
				800
				801	if (is_2m_pages) {
				802	b->ops->lock(b, num_pages, true, &b->target);
				803
				804	/*
				805	* ignore errors from locking as we now switch
				806	* to 4k pages and we might get different
				807	* errors.
				808	*/
				809
				810	num_pages = 0;
				811	is_2m_pages = false;
				812	continue;
				813	}
				814
				815	if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
				816	/*
				817	* CANSLEEP page allocation failed, so guest
				818	* is under severe memory pressure. We just log
				819	* the event, but do not stop the inflation
				820	* due to its negative impact on performance.
				821	*/
				822	STATS_INC(b->stats.sleep_alloc_fail);
				823	break;
				824	}
				825
				826	/*
				827	* NOSLEEP page allocation failed, so the guest is
				828	* under memory pressure. Slowing down page alloctions
				829	* seems to be reasonable, but doing so might actually
				830	* cause the hypervisor to throttle us down, resulting
				831	* in degraded performance. We will count on the
				832	* scheduler and standard memory management mechanisms
				833	* for now.
				834	*/
				835	flags = VMW_PAGE_ALLOC_CANSLEEP;
				836	continue;
				837	}
				838
				839	b->ops->add_page(b, num_pages++, page);
				840	if (num_pages == b->batch_max_pages) {
				841	error = b->ops->lock(b, num_pages, is_2m_pages,
				842	&b->target);
				843	num_pages = 0;
				844	if (error)
				845	break;
				846	}
				847
				848	cond_resched();
				849	}
				850
				851	if (num_pages > 0)
				852	b->ops->lock(b, num_pages, is_2m_pages, &b->target);
				853
				854	vmballoon_release_refused_pages(b, true);
				855	vmballoon_release_refused_pages(b, false);
				856	}
				857
				858	/*
				859	* Decrease the size of the balloon allowing guest to use more memory.
				860	*/
				861	static void vmballoon_deflate(struct vmballoon *b)
				862	{
				863	unsigned is_2m_pages;
				864
				865	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
				866
				867	/* free pages to reach target */
				868	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
				869	is_2m_pages++) {
				870	struct page page, next;
				871	unsigned int num_pages = 0;
				872	struct vmballoon_page_size *page_size =
				873	&b->page_sizes[is_2m_pages];
				874
				875	list_for_each_entry_safe(page, next, &page_size->pages, lru) {
				876	if (b->reset_required \|\|
				877	(b->target > 0 &&
				878	b->size - num_pages
				879	* vmballoon_page_size(is_2m_pages)
				880	< b->target + vmballoon_page_size(true)))
				881	break;
				882
				883	list_del(&page->lru);
				884	b->ops->add_page(b, num_pages++, page);
				885
				886	if (num_pages == b->batch_max_pages) {
				887	int error;
				888
				889	error = b->ops->unlock(b, num_pages,
				890	is_2m_pages, &b->target);
				891	num_pages = 0;
				892	if (error)
				893	return;
				894	}
				895
				896	cond_resched();
				897	}
				898
				899	if (num_pages > 0)
				900	b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
				901	}
				902	}
				903
				904	static const struct vmballoon_ops vmballoon_basic_ops = {
				905	.add_page = vmballoon_add_page,
				906	.lock = vmballoon_lock_page,
				907	.unlock = vmballoon_unlock_page
				908	};
				909
				910	static const struct vmballoon_ops vmballoon_batched_ops = {
				911	.add_page = vmballoon_add_batched_page,
				912	.lock = vmballoon_lock_batched_page,
				913	.unlock = vmballoon_unlock_batched_page
				914	};
				915
				916	static bool vmballoon_init_batching(struct vmballoon *b)
				917	{
				918	struct page *page;
				919
				920	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				921	if (!page)
				922	return false;
				923
				924	b->batch_page = page_address(page);
				925	return true;
				926	}
				927
				928	/*
				929	* Receive notification and resize balloon
				930	*/
				931	static void vmballoon_doorbell(void *client_data)
				932	{
				933	struct vmballoon *b = client_data;
				934
				935	STATS_INC(b->stats.doorbell);
				936
				937	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
				938	}
				939
				940	/*
				941	* Clean up vmci doorbell
				942	*/
				943	static void vmballoon_vmci_cleanup(struct vmballoon *b)
				944	{
				945	int error;
				946
				947	VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
				948	VMCI_INVALID_ID, error);
				949	STATS_INC(b->stats.doorbell_unset);
				950
				951	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
				952	vmci_doorbell_destroy(b->vmci_doorbell);
				953	b->vmci_doorbell = VMCI_INVALID_HANDLE;
				954	}
				955	}
				956
				957	/*
				958	* Initialize vmci doorbell, to get notified as soon as balloon changes
				959	*/
				960	static int vmballoon_vmci_init(struct vmballoon *b)
				961	{
				962	unsigned long error, dummy;
				963
				964	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
				965	return 0;
				966
				967	error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
				968	VMCI_PRIVILEGE_FLAG_RESTRICTED,
				969	vmballoon_doorbell, b);
				970
				971	if (error != VMCI_SUCCESS)
				972	goto fail;
				973
				974	error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context,
				975	b->vmci_doorbell.resource, dummy);
				976
				977	STATS_INC(b->stats.doorbell_set);
				978
				979	if (error != VMW_BALLOON_SUCCESS)
				980	goto fail;
				981
				982	return 0;
				983	fail:
				984	vmballoon_vmci_cleanup(b);
				985	return -EIO;
				986	}
				987
				988	/*
				989	* Perform standard reset sequence by popping the balloon (in case it
				990	* is not empty) and then restarting protocol. This operation normally
				991	* happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
				992	*/
				993	static void vmballoon_reset(struct vmballoon *b)
				994	{
				995	int error;
				996
				997	vmballoon_vmci_cleanup(b);
				998
				999	/* free all pages, skipping monitor unlock */
				1000	vmballoon_pop(b);
				1001
				1002	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
				1003	return;
				1004
				1005	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
				1006	b->ops = &vmballoon_batched_ops;
				1007	b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
				1008	if (!vmballoon_init_batching(b)) {
				1009	/*
				1010	* We failed to initialize batching, inform the monitor
				1011	* about it by sending a null capability.
				1012	*
				1013	* The guest will retry in one second.
				1014	*/
				1015	vmballoon_send_start(b, 0);
				1016	return;
				1017	}
				1018	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
				1019	b->ops = &vmballoon_basic_ops;
				1020	b->batch_max_pages = 1;
				1021	}
				1022
				1023	b->reset_required = false;
				1024
				1025	error = vmballoon_vmci_init(b);
				1026	if (error)
				1027	pr_err("failed to initialize vmci doorbell\n");
				1028
				1029	if (!vmballoon_send_guest_id(b))
				1030	pr_err("failed to send guest ID to the host\n");
				1031	}
				1032
				1033	/*
				1034	* Balloon work function: reset protocol, if needed, get the new size and
				1035	* adjust balloon as needed. Repeat in 1 sec.
				1036	*/
				1037	static void vmballoon_work(struct work_struct *work)
				1038	{
				1039	struct delayed_work *dwork = to_delayed_work(work);
				1040	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
				1041	unsigned int target;
				1042
				1043	STATS_INC(b->stats.timer);
				1044
				1045	if (b->reset_required)
				1046	vmballoon_reset(b);
				1047
				1048	if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
				1049	/* update target, adjust size */
				1050	b->target = target;
				1051
				1052	if (b->size < target)
				1053	vmballoon_inflate(b);
				1054	else if (target == 0 \|\|
				1055	b->size > target + vmballoon_page_size(true))
				1056	vmballoon_deflate(b);
				1057	}
				1058
				1059	/*
				1060	* We are using a freezable workqueue so that balloon operations are
				1061	* stopped while the system transitions to/from sleep/hibernation.
				1062	*/
				1063	queue_delayed_work(system_freezable_wq,
				1064	dwork, round_jiffies_relative(HZ));
				1065	}
				1066
				1067	/*
				1068	* DEBUGFS Interface
				1069	*/
				1070	#ifdef CONFIG_DEBUG_FS
				1071
				1072	static int vmballoon_debug_show(struct seq_file f, void offset)
				1073	{
				1074	struct vmballoon *b = f->private;
				1075	struct vmballoon_stats *stats = &b->stats;
				1076
				1077	/* format capabilities info */
				1078	seq_printf(f,
				1079	"balloon capabilities: %#4x\n"
				1080	"used capabilities: %#4lx\n"
				1081	"is resetting: %c\n",
				1082	VMW_BALLOON_CAPABILITIES, b->capabilities,
				1083	b->reset_required ? 'y' : 'n');
				1084
				1085	/* format size info */
				1086	seq_printf(f,
				1087	"target: %8d pages\n"
				1088	"current: %8d pages\n",
				1089	b->target, b->size);
				1090
				1091	seq_printf(f,
				1092	"\n"
				1093	"timer: %8u\n"
				1094	"doorbell: %8u\n"
				1095	"start: %8u (%4u failed)\n"
				1096	"guestType: %8u (%4u failed)\n"
				1097	"2m-lock: %8u (%4u failed)\n"
				1098	"lock: %8u (%4u failed)\n"
				1099	"2m-unlock: %8u (%4u failed)\n"
				1100	"unlock: %8u (%4u failed)\n"
				1101	"target: %8u (%4u failed)\n"
				1102	"prim2mAlloc: %8u (%4u failed)\n"
				1103	"primNoSleepAlloc: %8u (%4u failed)\n"
				1104	"primCanSleepAlloc: %8u (%4u failed)\n"
				1105	"prim2mFree: %8u\n"
				1106	"primFree: %8u\n"
				1107	"err2mAlloc: %8u\n"
				1108	"errAlloc: %8u\n"
				1109	"err2mFree: %8u\n"
				1110	"errFree: %8u\n"
				1111	"doorbellSet: %8u\n"
				1112	"doorbellUnset: %8u\n",
				1113	stats->timer,
				1114	stats->doorbell,
				1115	stats->start, stats->start_fail,
				1116	stats->guest_type, stats->guest_type_fail,
				1117	stats->lock[true], stats->lock_fail[true],
				1118	stats->lock[false], stats->lock_fail[false],
				1119	stats->unlock[true], stats->unlock_fail[true],
				1120	stats->unlock[false], stats->unlock_fail[false],
				1121	stats->target, stats->target_fail,
				1122	stats->alloc[true], stats->alloc_fail[true],
				1123	stats->alloc[false], stats->alloc_fail[false],
				1124	stats->sleep_alloc, stats->sleep_alloc_fail,
				1125	stats->free[true],
				1126	stats->free[false],
				1127	stats->refused_alloc[true], stats->refused_alloc[false],
				1128	stats->refused_free[true], stats->refused_free[false],
				1129	stats->doorbell_set, stats->doorbell_unset);
				1130
				1131	return 0;
				1132	}
				1133
				1134	static int vmballoon_debug_open(struct inode inode, struct file file)
				1135	{
				1136	return single_open(file, vmballoon_debug_show, inode->i_private);
				1137	}
				1138
				1139	static const struct file_operations vmballoon_debug_fops = {
				1140	.owner = THIS_MODULE,
				1141	.open = vmballoon_debug_open,
				1142	.read = seq_read,
				1143	.llseek = seq_lseek,
				1144	.release = single_release,
				1145	};
				1146
				1147	static int __init vmballoon_debugfs_init(struct vmballoon *b)
				1148	{
				1149	int error;
				1150
				1151	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
				1152	&vmballoon_debug_fops);
				1153	if (IS_ERR(b->dbg_entry)) {
				1154	error = PTR_ERR(b->dbg_entry);
				1155	pr_err("failed to create debugfs entry, error: %d\n", error);
				1156	return error;
				1157	}
				1158
				1159	return 0;
				1160	}
				1161
				1162	static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
				1163	{
				1164	debugfs_remove(b->dbg_entry);
				1165	}
				1166
				1167	#else
				1168
				/* Stub used when CONFIG_DEBUG_FS is off: nothing to create, report success. */
				static inline int vmballoon_debugfs_init(struct vmballoon *b)
				{
					return 0;
				}
				1173
				/* Stub used when CONFIG_DEBUG_FS is off: there is no entry to remove. */
				static inline void vmballoon_debugfs_exit(struct vmballoon *b)
				{
				}
				1177
				1178	#endif /* CONFIG_DEBUG_FS */
				1179
				1180	static int __init vmballoon_init(void)
				1181	{
				1182	int error;
				1183	unsigned is_2m_pages;
				1184	/*
				1185	* Check if we are running on VMware's hypervisor and bail out
				1186	* if we are not.
				1187	*/
				1188	if (x86_hyper_type != X86_HYPER_VMWARE)
				1189	return -ENODEV;
				1190
				1191	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
				1192	is_2m_pages++) {
				1193	INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
				1194	INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
				1195	}
				1196
				1197	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
				1198
				1199	error = vmballoon_debugfs_init(&balloon);
				1200	if (error)
				1201	return error;
				1202
				1203	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
				1204	balloon.batch_page = NULL;
				1205	balloon.page = NULL;
				1206	balloon.reset_required = true;
				1207
				1208	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
				1209
				1210	return 0;
				1211	}
				1212
				1213	/*
				1214	* Using late_initcall() instead of module_init() allows the balloon to use the
				1215	* VMCI doorbell even when the balloon is built into the kernel. Otherwise the
				1216	* VMCI is probed only after the balloon is initialized. If the balloon is used
				1217	* as a module, late_initcall() is equivalent to module_init().
				1218	*/
				1219	late_initcall(vmballoon_init);
				1220
				1221	static void __exit vmballoon_exit(void)
				1222	{
				1223	vmballoon_vmci_cleanup(&balloon);
				1224	cancel_delayed_work_sync(&balloon.dwork);
				1225
				1226	vmballoon_debugfs_exit(&balloon);
				1227
				1228	/*
				1229	* Deallocate all reserved memory, and reset connection with monitor.
				1230	* Reset connection before deallocating memory to avoid potential for
				1231	* additional spurious resets from guest touching deallocated pages.
				1232	*/
				1233	vmballoon_send_start(&balloon, 0);
				1234	vmballoon_pop(&balloon);
				1235	}
				1236	module_exit(vmballoon_exit);