Blame - src/kernel/linux/v4.14/drivers/iommu/amd_iommu.c - T103

blob: 494caaa265af0b5dd6624ffa916f97db733239c7 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
				3	* Author: Joerg Roedel <jroedel@suse.de>
				4	* Leo Duran <leo.duran@amd.com>
				5	*
				6	* This program is free software; you can redistribute it and/or modify it
				7	* under the terms of the GNU General Public License version 2 as published
				8	* by the Free Software Foundation.
				9	*
				10	* This program is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				13	* GNU General Public License for more details.
				14	*
				15	* You should have received a copy of the GNU General Public License
				16	* along with this program; if not, write to the Free Software
				17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
				18	*/
				19
				20	#include <linux/ratelimit.h>
				21	#include <linux/pci.h>
				22	#include <linux/acpi.h>
				23	#include <linux/amba/bus.h>
				24	#include <linux/platform_device.h>
				25	#include <linux/pci-ats.h>
				26	#include <linux/bitmap.h>
				27	#include <linux/slab.h>
				28	#include <linux/debugfs.h>
				29	#include <linux/scatterlist.h>
				30	#include <linux/dma-mapping.h>
				31	#include <linux/iommu-helper.h>
				32	#include <linux/iommu.h>
				33	#include <linux/delay.h>
				34	#include <linux/amd-iommu.h>
				35	#include <linux/notifier.h>
				36	#include <linux/export.h>
				37	#include <linux/irq.h>
				38	#include <linux/msi.h>
				39	#include <linux/dma-contiguous.h>
				40	#include <linux/irqdomain.h>
				41	#include <linux/percpu.h>
				42	#include <linux/iova.h>
				43	#include <asm/irq_remapping.h>
				44	#include <asm/io_apic.h>
				45	#include <asm/apic.h>
				46	#include <asm/hw_irq.h>
				47	#include <asm/msidef.h>
				48	#include <asm/proto.h>
				49	#include <asm/iommu.h>
				50	#include <asm/gart.h>
				51	#include <asm/dma.h>
				52
				53	#include "amd_iommu_proto.h"
				54	#include "amd_iommu_types.h"
				55	#include "irq_remapping.h"
				56
				57	#define AMD_IOMMU_MAPPING_ERROR 0
				58
				59	#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] \|= ((t) << 28))
				60
				61	#define LOOP_TIMEOUT 100000
				62
				63	/* IO virtual address start page frame number */
				64	#define IOVA_START_PFN (1)
				65	#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
				66	#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
				67
				68	/* Reserved IOVA ranges */
				69	#define MSI_RANGE_START (0xfee00000)
				70	#define MSI_RANGE_END (0xfeefffff)
				71	#define HT_RANGE_START (0xfd00000000ULL)
				72	#define HT_RANGE_END (0xffffffffffULL)
				73
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * 512GB Pages are not supported due to a hardware bug
 */
				82	#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
				83
				84	static DEFINE_RWLOCK(amd_iommu_devtable_lock);
				85
				86	/* List of all available dev_data structures */
				87	static LIST_HEAD(dev_data_list);
				88	static DEFINE_SPINLOCK(dev_data_list_lock);
				89
				90	LIST_HEAD(ioapic_map);
				91	LIST_HEAD(hpet_map);
				92	LIST_HEAD(acpihid_map);
				93
				94	/*
				95	* Domain for untranslated devices - only allocated
				96	* if iommu=pt passed on kernel cmd line.
				97	*/
				98	const struct iommu_ops amd_iommu_ops;
				99
				100	static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
				101	int amd_iommu_max_glx_val = -1;
				102
				103	static const struct dma_map_ops amd_iommu_dma_ops;
				104
/*
 * General struct to manage commands sent to an IOMMU
 */
				108	struct iommu_cmd {
				109	u32 data[4];
				110	};
				111
				112	struct kmem_cache *amd_iommu_irq_cache;
				113
				114	static void update_domain(struct protection_domain *domain);
				115	static int protection_domain_init(struct protection_domain *domain);
				116	static void detach_device(struct device *dev);
				117	static void iova_domain_flush_tlb(struct iova_domain *iovad);
				118
				119	/*
				120	* Data container for a dma_ops specific protection domain
				121	*/
				122	struct dma_ops_domain {
				123	/* generic protection domain information */
				124	struct protection_domain domain;
				125
				126	/* IOVA RB-Tree */
				127	struct iova_domain iovad;
				128	};
				129
				130	static struct iova_domain reserved_iova_ranges;
				131	static struct lock_class_key reserved_rbtree_key;
				132
				133	/****************************************************************************
				134	*
				135	* Helper functions
				136	*
				137	****************************************************************************/
				138
				139	static inline int match_hid_uid(struct device *dev,
				140	struct acpihid_map_entry *entry)
				141	{
				142	struct acpi_device *adev = ACPI_COMPANION(dev);
				143	const char hid, uid;
				144
				145	if (!adev)
				146	return -ENODEV;
				147
				148	hid = acpi_device_hid(adev);
				149	uid = acpi_device_uid(adev);
				150
				151	if (!hid \|\| !(*hid))
				152	return -ENODEV;
				153
				154	if (!uid \|\| !(*uid))
				155	return strcmp(hid, entry->hid);
				156
				157	if (!(*entry->uid))
				158	return strcmp(hid, entry->hid);
				159
				160	return (strcmp(hid, entry->hid) \|\| strcmp(uid, entry->uid));
				161	}
				162
				163	static inline u16 get_pci_device_id(struct device *dev)
				164	{
				165	struct pci_dev *pdev = to_pci_dev(dev);
				166
				167	return PCI_DEVID(pdev->bus->number, pdev->devfn);
				168	}
				169
				170	static inline int get_acpihid_device_id(struct device *dev,
				171	struct acpihid_map_entry **entry)
				172	{
				173	struct acpihid_map_entry *p;
				174
				175	list_for_each_entry(p, &acpihid_map, list) {
				176	if (!match_hid_uid(dev, p)) {
				177	if (entry)
				178	*entry = p;
				179	return p->devid;
				180	}
				181	}
				182	return -EINVAL;
				183	}
				184
				185	static inline int get_device_id(struct device *dev)
				186	{
				187	int devid;
				188
				189	if (dev_is_pci(dev))
				190	devid = get_pci_device_id(dev);
				191	else
				192	devid = get_acpihid_device_id(dev, NULL);
				193
				194	return devid;
				195	}
				196
				197	static struct protection_domain to_pdomain(struct iommu_domain dom)
				198	{
				199	return container_of(dom, struct protection_domain, domain);
				200	}
				201
				202	static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain)
				203	{
				204	BUG_ON(domain->flags != PD_DMA_OPS_MASK);
				205	return container_of(domain, struct dma_ops_domain, domain);
				206	}
				207
				208	static struct iommu_dev_data *alloc_dev_data(u16 devid)
				209	{
				210	struct iommu_dev_data *dev_data;
				211	unsigned long flags;
				212
				213	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
				214	if (!dev_data)
				215	return NULL;
				216
				217	dev_data->devid = devid;
				218
				219	spin_lock_irqsave(&dev_data_list_lock, flags);
				220	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
				221	spin_unlock_irqrestore(&dev_data_list_lock, flags);
				222
				223	ratelimit_default_init(&dev_data->rs);
				224
				225	return dev_data;
				226	}
				227
				228	static struct iommu_dev_data *search_dev_data(u16 devid)
				229	{
				230	struct iommu_dev_data *dev_data;
				231	unsigned long flags;
				232
				233	spin_lock_irqsave(&dev_data_list_lock, flags);
				234	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
				235	if (dev_data->devid == devid)
				236	goto out_unlock;
				237	}
				238
				239	dev_data = NULL;
				240
				241	out_unlock:
				242	spin_unlock_irqrestore(&dev_data_list_lock, flags);
				243
				244	return dev_data;
				245	}
				246
				247	static int __last_alias(struct pci_dev pdev, u16 alias, void data)
				248	{
				249	(u16 )data = alias;
				250	return 0;
				251	}
				252
				253	static u16 get_alias(struct device *dev)
				254	{
				255	struct pci_dev *pdev = to_pci_dev(dev);
				256	u16 devid, ivrs_alias, pci_alias;
				257
				258	/* The callers make sure that get_device_id() does not fail here */
				259	devid = get_device_id(dev);
				260
				261	/* For ACPI HID devices, we simply return the devid as such */
				262	if (!dev_is_pci(dev))
				263	return devid;
				264
				265	ivrs_alias = amd_iommu_alias_table[devid];
				266
				267	pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);
				268
				269	if (ivrs_alias == pci_alias)
				270	return ivrs_alias;
				271
				272	/*
				273	* DMA alias showdown
				274	*
				275	* The IVRS is fairly reliable in telling us about aliases, but it
				276	* can't know about every screwy device. If we don't have an IVRS
				277	* reported alias, use the PCI reported alias. In that case we may
				278	* still need to initialize the rlookup and dev_table entries if the
				279	* alias is to a non-existent device.
				280	*/
				281	if (ivrs_alias == devid) {
				282	if (!amd_iommu_rlookup_table[pci_alias]) {
				283	amd_iommu_rlookup_table[pci_alias] =
				284	amd_iommu_rlookup_table[devid];
				285	memcpy(amd_iommu_dev_table[pci_alias].data,
				286	amd_iommu_dev_table[devid].data,
				287	sizeof(amd_iommu_dev_table[pci_alias].data));
				288	}
				289
				290	return pci_alias;
				291	}
				292
				293	pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
				294	"for device %s[%04x:%04x], kernel reported alias "
				295	"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
				296	PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
				297	PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
				298	PCI_FUNC(pci_alias));
				299
				300	/*
				301	* If we don't have a PCI DMA alias and the IVRS alias is on the same
				302	* bus, then the IVRS table may know about a quirk that we don't.
				303	*/
				304	if (pci_alias == devid &&
				305	PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
				306	pci_add_dma_alias(pdev, ivrs_alias & 0xff);
				307	pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
				308	PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
				309	dev_name(dev));
				310	}
				311
				312	return ivrs_alias;
				313	}
				314
				315	static struct iommu_dev_data *find_dev_data(u16 devid)
				316	{
				317	struct iommu_dev_data *dev_data;
				318	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
				319
				320	dev_data = search_dev_data(devid);
				321
				322	if (dev_data == NULL) {
				323	dev_data = alloc_dev_data(devid);
				324	if (!dev_data)
				325	return NULL;
				326
				327	if (translation_pre_enabled(iommu))
				328	dev_data->defer_attach = true;
				329	}
				330
				331	return dev_data;
				332	}
				333
				334	struct iommu_dev_data get_dev_data(struct device dev)
				335	{
				336	return dev->archdata.iommu;
				337	}
				338	EXPORT_SYMBOL(get_dev_data);
				339
				340	/*
				341	* Find or create an IOMMU group for a acpihid device.
				342	*/
				343	static struct iommu_group acpihid_device_group(struct device dev)
				344	{
				345	struct acpihid_map_entry p, entry = NULL;
				346	int devid;
				347
				348	devid = get_acpihid_device_id(dev, &entry);
				349	if (devid < 0)
				350	return ERR_PTR(devid);
				351
				352	list_for_each_entry(p, &acpihid_map, list) {
				353	if ((devid == p->devid) && p->group)
				354	entry->group = p->group;
				355	}
				356
				357	if (!entry->group)
				358	entry->group = generic_device_group(dev);
				359	else
				360	iommu_group_ref_get(entry->group);
				361
				362	return entry->group;
				363	}
				364
				365	static bool pci_iommuv2_capable(struct pci_dev *pdev)
				366	{
				367	static const int caps[] = {
				368	PCI_EXT_CAP_ID_ATS,
				369	PCI_EXT_CAP_ID_PRI,
				370	PCI_EXT_CAP_ID_PASID,
				371	};
				372	int i, pos;
				373
				374	for (i = 0; i < 3; ++i) {
				375	pos = pci_find_ext_capability(pdev, caps[i]);
				376	if (pos == 0)
				377	return false;
				378	}
				379
				380	return true;
				381	}
				382
				383	static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
				384	{
				385	struct iommu_dev_data *dev_data;
				386
				387	dev_data = get_dev_data(&pdev->dev);
				388
				389	return dev_data->errata & (1 << erratum) ? true : false;
				390	}
				391
				392	/*
				393	* This function checks if the driver got a valid device from the caller to
				394	* avoid dereferencing invalid pointers.
				395	*/
				396	static bool check_device(struct device *dev)
				397	{
				398	int devid;
				399
				400	if (!dev \|\| !dev->dma_mask)
				401	return false;
				402
				403	devid = get_device_id(dev);
				404	if (devid < 0)
				405	return false;
				406
				407	/* Out of our scope? */
				408	if (devid > amd_iommu_last_bdf)
				409	return false;
				410
				411	if (amd_iommu_rlookup_table[devid] == NULL)
				412	return false;
				413
				414	return true;
				415	}
				416
				417	static void init_iommu_group(struct device *dev)
				418	{
				419	struct iommu_group *group;
				420
				421	group = iommu_group_get_for_dev(dev);
				422	if (IS_ERR(group))
				423	return;
				424
				425	iommu_group_put(group);
				426	}
				427
				428	static int iommu_init_device(struct device *dev)
				429	{
				430	struct iommu_dev_data *dev_data;
				431	struct amd_iommu *iommu;
				432	int devid;
				433
				434	if (dev->archdata.iommu)
				435	return 0;
				436
				437	devid = get_device_id(dev);
				438	if (devid < 0)
				439	return devid;
				440
				441	iommu = amd_iommu_rlookup_table[devid];
				442
				443	dev_data = find_dev_data(devid);
				444	if (!dev_data)
				445	return -ENOMEM;
				446
				447	dev_data->alias = get_alias(dev);
				448
				449	/*
				450	* By default we use passthrough mode for IOMMUv2 capable device.
				451	* But if amd_iommu=force_isolation is set (e.g. to debug DMA to
				452	* invalid address), we ignore the capability for the device so
				453	* it'll be forced to go into translation mode.
				454	*/
				455	if ((iommu_pass_through \|\| !amd_iommu_force_isolation) &&
				456	dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
				457	struct amd_iommu *iommu;
				458
				459	iommu = amd_iommu_rlookup_table[dev_data->devid];
				460	dev_data->iommu_v2 = iommu->is_iommu_v2;
				461	}
				462
				463	dev->archdata.iommu = dev_data;
				464
				465	iommu_device_link(&iommu->iommu, dev);
				466
				467	return 0;
				468	}
				469
				470	static void iommu_ignore_device(struct device *dev)
				471	{
				472	u16 alias;
				473	int devid;
				474
				475	devid = get_device_id(dev);
				476	if (devid < 0)
				477	return;
				478
				479	alias = get_alias(dev);
				480
				481	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
				482	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
				483
				484	amd_iommu_rlookup_table[devid] = NULL;
				485	amd_iommu_rlookup_table[alias] = NULL;
				486	}
				487
				488	static void iommu_uninit_device(struct device *dev)
				489	{
				490	struct iommu_dev_data *dev_data;
				491	struct amd_iommu *iommu;
				492	int devid;
				493
				494	devid = get_device_id(dev);
				495	if (devid < 0)
				496	return;
				497
				498	iommu = amd_iommu_rlookup_table[devid];
				499
				500	dev_data = search_dev_data(devid);
				501	if (!dev_data)
				502	return;
				503
				504	if (dev_data->domain)
				505	detach_device(dev);
				506
				507	iommu_device_unlink(&iommu->iommu, dev);
				508
				509	iommu_group_remove_device(dev);
				510
				511	/* Remove dma-ops */
				512	dev->dma_ops = NULL;
				513
				514	/*
				515	* We keep dev_data around for unplugged devices and reuse it when the
				516	* device is re-plugged - not doing so would introduce a ton of races.
				517	*/
				518	}
				519
				520	/****************************************************************************
				521	*
				522	* Interrupt handling functions
				523	*
				524	****************************************************************************/
				525
				526	static void dump_dte_entry(u16 devid)
				527	{
				528	int i;
				529
				530	for (i = 0; i < 4; ++i)
				531	pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
				532	amd_iommu_dev_table[devid].data[i]);
				533	}
				534
				535	static void dump_command(unsigned long phys_addr)
				536	{
				537	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
				538	int i;
				539
				540	for (i = 0; i < 4; ++i)
				541	pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
				542	}
				543
				544	static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
				545	u64 address, int flags)
				546	{
				547	struct iommu_dev_data *dev_data = NULL;
				548	struct pci_dev *pdev;
				549
				550	pdev = pci_get_bus_and_slot(PCI_BUS_NUM(devid), devid & 0xff);
				551	if (pdev)
				552	dev_data = get_dev_data(&pdev->dev);
				553
				554	if (dev_data && __ratelimit(&dev_data->rs)) {
				555	dev_err(&pdev->dev, "AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%016llx flags=0x%04x]\n",
				556	domain_id, address, flags);
				557	} else if (printk_ratelimit()) {
				558	pr_err("AMD-Vi: Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n",
				559	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				560	domain_id, address, flags);
				561	}
				562
				563	if (pdev)
				564	pci_dev_put(pdev);
				565	}
				566
				567	static void iommu_print_event(struct amd_iommu iommu, void __evt)
				568	{
				569	int type, devid, domid, flags;
				570	volatile u32 *event = __evt;
				571	int count = 0;
				572	u64 address;
				573
				574	retry:
				575	type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
				576	devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
				577	domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
				578	flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
				579	address = (u64)(((u64)event[3]) << 32) \| event[2];
				580
				581	if (type == 0) {
				582	/* Did we hit the erratum? */
				583	if (++count == LOOP_TIMEOUT) {
				584	pr_err("AMD-Vi: No event written to event log\n");
				585	return;
				586	}
				587	udelay(1);
				588	goto retry;
				589	}
				590
				591	if (type == EVENT_TYPE_IO_FAULT) {
				592	amd_iommu_report_page_fault(devid, domid, address, flags);
				593	return;
				594	} else {
				595	printk(KERN_ERR "AMD-Vi: Event logged [");
				596	}
				597
				598	switch (type) {
				599	case EVENT_TYPE_ILL_DEV:
				600	printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
				601	"address=0x%016llx flags=0x%04x]\n",
				602	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				603	address, flags);
				604	dump_dte_entry(devid);
				605	break;
				606	case EVENT_TYPE_DEV_TAB_ERR:
				607	printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
				608	"address=0x%016llx flags=0x%04x]\n",
				609	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				610	address, flags);
				611	break;
				612	case EVENT_TYPE_PAGE_TAB_ERR:
				613	printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
				614	"domain=0x%04x address=0x%016llx flags=0x%04x]\n",
				615	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				616	domid, address, flags);
				617	break;
				618	case EVENT_TYPE_ILL_CMD:
				619	printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
				620	dump_command(address);
				621	break;
				622	case EVENT_TYPE_CMD_HARD_ERR:
				623	printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
				624	"flags=0x%04x]\n", address, flags);
				625	break;
				626	case EVENT_TYPE_IOTLB_INV_TO:
				627	printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
				628	"address=0x%016llx]\n",
				629	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				630	address);
				631	break;
				632	case EVENT_TYPE_INV_DEV_REQ:
				633	printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
				634	"address=0x%016llx flags=0x%04x]\n",
				635	PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
				636	address, flags);
				637	break;
				638	default:
				639	printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
				640	}
				641
				642	memset(__evt, 0, 4 * sizeof(u32));
				643	}
				644
				645	static void iommu_poll_events(struct amd_iommu *iommu)
				646	{
				647	u32 head, tail;
				648
				649	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
				650	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
				651
				652	while (head != tail) {
				653	iommu_print_event(iommu, iommu->evt_buf + head);
				654	head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
				655	}
				656
				657	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
				658	}
				659
				660	static void iommu_handle_ppr_entry(struct amd_iommu iommu, u64 raw)
				661	{
				662	struct amd_iommu_fault fault;
				663
				664	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
				665	pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
				666	return;
				667	}
				668
				669	fault.address = raw[1];
				670	fault.pasid = PPR_PASID(raw[0]);
				671	fault.device_id = PPR_DEVID(raw[0]);
				672	fault.tag = PPR_TAG(raw[0]);
				673	fault.flags = PPR_FLAGS(raw[0]);
				674
				675	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
				676	}
				677
				678	static void iommu_poll_ppr_log(struct amd_iommu *iommu)
				679	{
				680	u32 head, tail;
				681
				682	if (iommu->ppr_log == NULL)
				683	return;
				684
				685	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
				686	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
				687
				688	while (head != tail) {
				689	volatile u64 *raw;
				690	u64 entry[2];
				691	int i;
				692
				693	raw = (u64 *)(iommu->ppr_log + head);
				694
				695	/*
				696	* Hardware bug: Interrupt may arrive before the entry is
				697	* written to memory. If this happens we need to wait for the
				698	* entry to arrive.
				699	*/
				700	for (i = 0; i < LOOP_TIMEOUT; ++i) {
				701	if (PPR_REQ_TYPE(raw[0]) != 0)
				702	break;
				703	udelay(1);
				704	}
				705
				706	/* Avoid memcpy function-call overhead */
				707	entry[0] = raw[0];
				708	entry[1] = raw[1];
				709
				710	/*
				711	* To detect the hardware bug we need to clear the entry
				712	* back to zero.
				713	*/
				714	raw[0] = raw[1] = 0UL;
				715
				716	/* Update head pointer of hardware ring-buffer */
				717	head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
				718	writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
				719
				720	/* Handle PPR entry */
				721	iommu_handle_ppr_entry(iommu, entry);
				722
				723	/* Refresh ring-buffer information */
				724	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
				725	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
				726	}
				727	}
				728
				729	#ifdef CONFIG_IRQ_REMAP
				730	static int (*iommu_ga_log_notifier)(u32);
				731
				732	int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
				733	{
				734	iommu_ga_log_notifier = notifier;
				735
				736	return 0;
				737	}
				738	EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
				739
				740	static void iommu_poll_ga_log(struct amd_iommu *iommu)
				741	{
				742	u32 head, tail, cnt = 0;
				743
				744	if (iommu->ga_log == NULL)
				745	return;
				746
				747	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
				748	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
				749
				750	while (head != tail) {
				751	volatile u64 *raw;
				752	u64 log_entry;
				753
				754	raw = (u64 *)(iommu->ga_log + head);
				755	cnt++;
				756
				757	/* Avoid memcpy function-call overhead */
				758	log_entry = *raw;
				759
				760	/* Update head pointer of hardware ring-buffer */
				761	head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
				762	writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
				763
				764	/* Handle GA entry */
				765	switch (GA_REQ_TYPE(log_entry)) {
				766	case GA_GUEST_NR:
				767	if (!iommu_ga_log_notifier)
				768	break;
				769
				770	pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
				771	__func__, GA_DEVID(log_entry),
				772	GA_TAG(log_entry));
				773
				774	if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
				775	pr_err("AMD-Vi: GA log notifier failed.\n");
				776	break;
				777	default:
				778	break;
				779	}
				780	}
				781	}
				782	#endif /* CONFIG_IRQ_REMAP */
				783
				784	#define AMD_IOMMU_INT_MASK \
				785	(MMIO_STATUS_EVT_INT_MASK \| \
				786	MMIO_STATUS_PPR_INT_MASK \| \
				787	MMIO_STATUS_GALOG_INT_MASK)
				788
				789	irqreturn_t amd_iommu_int_thread(int irq, void *data)
				790	{
				791	struct amd_iommu iommu = (struct amd_iommu ) data;
				792	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
				793
				794	while (status & AMD_IOMMU_INT_MASK) {
				795	/* Enable EVT and PPR and GA interrupts again */
				796	writel(AMD_IOMMU_INT_MASK,
				797	iommu->mmio_base + MMIO_STATUS_OFFSET);
				798
				799	if (status & MMIO_STATUS_EVT_INT_MASK) {
				800	pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
				801	iommu_poll_events(iommu);
				802	}
				803
				804	if (status & MMIO_STATUS_PPR_INT_MASK) {
				805	pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
				806	iommu_poll_ppr_log(iommu);
				807	}
				808
				809	#ifdef CONFIG_IRQ_REMAP
				810	if (status & MMIO_STATUS_GALOG_INT_MASK) {
				811	pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
				812	iommu_poll_ga_log(iommu);
				813	}
				814	#endif
				815
				816	/*
				817	* Hardware bug: ERBT1312
				818	* When re-enabling interrupt (by writing 1
				819	* to clear the bit), the hardware might also try to set
				820	* the interrupt bit in the event status register.
				821	* In this scenario, the bit will be set, and disable
				822	* subsequent interrupts.
				823	*
				824	* Workaround: The IOMMU driver should read back the
				825	* status register and check if the interrupt bits are cleared.
				826	* If not, driver will need to go through the interrupt handler
				827	* again and re-clear the bits
				828	*/
				829	status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
				830	}
				831	return IRQ_HANDLED;
				832	}
				833
				834	irqreturn_t amd_iommu_int_handler(int irq, void *data)
				835	{
				836	return IRQ_WAKE_THREAD;
				837	}
				838
				839	/****************************************************************************
				840	*
				841	* IOMMU command queuing functions
				842	*
				843	****************************************************************************/
				844
				845	static int wait_on_sem(volatile u64 *sem)
				846	{
				847	int i = 0;
				848
				849	while (*sem == 0 && i < LOOP_TIMEOUT) {
				850	udelay(1);
				851	i += 1;
				852	}
				853
				854	if (i == LOOP_TIMEOUT) {
				855	pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
				856	return -EIO;
				857	}
				858
				859	return 0;
				860	}
				861
				862	static void copy_cmd_to_buffer(struct amd_iommu *iommu,
				863	struct iommu_cmd *cmd)
				864	{
				865	u8 *target;
				866
				867	target = iommu->cmd_buf + iommu->cmd_buf_tail;
				868
				869	iommu->cmd_buf_tail += sizeof(*cmd);
				870	iommu->cmd_buf_tail %= CMD_BUFFER_SIZE;
				871
				872	/* Copy command to buffer */
				873	memcpy(target, cmd, sizeof(*cmd));
				874
				875	/* Tell the IOMMU about it */
				876	writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
				877	}
				878
				879	static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
				880	{
				881	u64 paddr = iommu_virt_to_phys((void *)address);
				882
				883	WARN_ON(address & 0x7ULL);
				884
				885	memset(cmd, 0, sizeof(*cmd));
				886	cmd->data[0] = lower_32_bits(paddr) \| CMD_COMPL_WAIT_STORE_MASK;
				887	cmd->data[1] = upper_32_bits(paddr);
				888	cmd->data[2] = 1;
				889	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
				890	}
				891
				892	static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
				893	{
				894	memset(cmd, 0, sizeof(*cmd));
				895	cmd->data[0] = devid;
				896	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
				897	}
				898
				899	static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				900	size_t size, u16 domid, int pde)
				901	{
				902	u64 pages;
				903	bool s;
				904
				905	pages = iommu_num_pages(address, size, PAGE_SIZE);
				906	s = false;
				907
				908	if (pages > 1) {
				909	/*
				910	* If we have to flush more than one page, flush all
				911	* TLB entries for this domain
				912	*/
				913	address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
				914	s = true;
				915	}
				916
				917	address &= PAGE_MASK;
				918
				919	memset(cmd, 0, sizeof(*cmd));
				920	cmd->data[1] \|= domid;
				921	cmd->data[2] = lower_32_bits(address);
				922	cmd->data[3] = upper_32_bits(address);
				923	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
				924	if (s) /* size bit - we flush more than one 4kb page */
				925	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_SIZE_MASK;
				926	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
				927	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_PDE_MASK;
				928	}
				929
				930	static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				931	u64 address, size_t size)
				932	{
				933	u64 pages;
				934	bool s;
				935
				936	pages = iommu_num_pages(address, size, PAGE_SIZE);
				937	s = false;
				938
				939	if (pages > 1) {
				940	/*
				941	* If we have to flush more than one page, flush all
				942	* TLB entries for this domain
				943	*/
				944	address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
				945	s = true;
				946	}
				947
				948	address &= PAGE_MASK;
				949
				950	memset(cmd, 0, sizeof(*cmd));
				951	cmd->data[0] = devid;
				952	cmd->data[0] \|= (qdep & 0xff) << 24;
				953	cmd->data[1] = devid;
				954	cmd->data[2] = lower_32_bits(address);
				955	cmd->data[3] = upper_32_bits(address);
				956	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
				957	if (s)
				958	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_SIZE_MASK;
				959	}
				960
				961	static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
				962	u64 address, bool size)
				963	{
				964	memset(cmd, 0, sizeof(*cmd));
				965
				966	address &= ~(0xfffULL);
				967
				968	cmd->data[0] = pasid;
				969	cmd->data[1] = domid;
				970	cmd->data[2] = lower_32_bits(address);
				971	cmd->data[3] = upper_32_bits(address);
				972	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_PDE_MASK;
				973	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_GN_MASK;
				974	if (size)
				975	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_SIZE_MASK;
				976	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
				977	}
				978
				979	static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
				980	int qdep, u64 address, bool size)
				981	{
				982	memset(cmd, 0, sizeof(*cmd));
				983
				984	address &= ~(0xfffULL);
				985
				986	cmd->data[0] = devid;
				987	cmd->data[0] \|= ((pasid >> 8) & 0xff) << 16;
				988	cmd->data[0] \|= (qdep & 0xff) << 24;
				989	cmd->data[1] = devid;
				990	cmd->data[1] \|= (pasid & 0xff) << 16;
				991	cmd->data[2] = lower_32_bits(address);
				992	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_GN_MASK;
				993	cmd->data[3] = upper_32_bits(address);
				994	if (size)
				995	cmd->data[2] \|= CMD_INV_IOMMU_PAGES_SIZE_MASK;
				996	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
				997	}
				998
				999	static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
				1000	int status, int tag, bool gn)
				1001	{
				1002	memset(cmd, 0, sizeof(*cmd));
				1003
				1004	cmd->data[0] = devid;
				1005	if (gn) {
				1006	cmd->data[1] = pasid;
				1007	cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
				1008	}
				1009	cmd->data[3] = tag & 0x1ff;
				1010	cmd->data[3] \|= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
				1011
				1012	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
				1013	}
				1014
				1015	static void build_inv_all(struct iommu_cmd *cmd)
				1016	{
				1017	memset(cmd, 0, sizeof(*cmd));
				1018	CMD_SET_TYPE(cmd, CMD_INV_ALL);
				1019	}
				1020
				1021	static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
				1022	{
				1023	memset(cmd, 0, sizeof(*cmd));
				1024	cmd->data[0] = devid;
				1025	CMD_SET_TYPE(cmd, CMD_INV_IRT);
				1026	}
				1027
				1028	/*
				1029	* Writes the command to the IOMMUs command buffer and informs the
				1030	* hardware about the new command.
				1031	*/
				1032	static int __iommu_queue_command_sync(struct amd_iommu *iommu,
				1033	struct iommu_cmd *cmd,
				1034	bool sync)
				1035	{
				1036	unsigned int count = 0;
				1037	u32 left, next_tail;
				1038
				1039	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
				1040	again:
				1041	left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
				1042
				1043	if (left <= 0x20) {
				1044	/* Skip udelay() the first time around */
				1045	if (count++) {
				1046	if (count == LOOP_TIMEOUT) {
				1047	pr_err("AMD-Vi: Command buffer timeout\n");
				1048	return -EIO;
				1049	}
				1050
				1051	udelay(1);
				1052	}
				1053
				1054	/* Update head and recheck remaining space */
				1055	iommu->cmd_buf_head = readl(iommu->mmio_base +
				1056	MMIO_CMD_HEAD_OFFSET);
				1057
				1058	goto again;
				1059	}
				1060
				1061	copy_cmd_to_buffer(iommu, cmd);
				1062
				1063	/* Do we need to make sure all commands are processed? */
				1064	iommu->need_sync = sync;
				1065
				1066	return 0;
				1067	}
				1068
				1069	static int iommu_queue_command_sync(struct amd_iommu *iommu,
				1070	struct iommu_cmd *cmd,
				1071	bool sync)
				1072	{
				1073	unsigned long flags;
				1074	int ret;
				1075
				1076	spin_lock_irqsave(&iommu->lock, flags);
				1077	ret = __iommu_queue_command_sync(iommu, cmd, sync);
				1078	spin_unlock_irqrestore(&iommu->lock, flags);
				1079
				1080	return ret;
				1081	}
				1082
				1083	static int iommu_queue_command(struct amd_iommu iommu, struct iommu_cmd cmd)
				1084	{
				1085	return iommu_queue_command_sync(iommu, cmd, true);
				1086	}
				1087
				1088	/*
				1089	* This function queues a completion wait command into the command
				1090	* buffer of an IOMMU
				1091	*/
				1092	static int iommu_completion_wait(struct amd_iommu *iommu)
				1093	{
				1094	struct iommu_cmd cmd;
				1095	unsigned long flags;
				1096	int ret;
				1097
				1098	if (!iommu->need_sync)
				1099	return 0;
				1100
				1101
				1102	build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
				1103
				1104	spin_lock_irqsave(&iommu->lock, flags);
				1105
				1106	iommu->cmd_sem = 0;
				1107
				1108	ret = __iommu_queue_command_sync(iommu, &cmd, false);
				1109	if (ret)
				1110	goto out_unlock;
				1111
				1112	ret = wait_on_sem(&iommu->cmd_sem);
				1113
				1114	out_unlock:
				1115	spin_unlock_irqrestore(&iommu->lock, flags);
				1116
				1117	return ret;
				1118	}
				1119
				1120	static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
				1121	{
				1122	struct iommu_cmd cmd;
				1123
				1124	build_inv_dte(&cmd, devid);
				1125
				1126	return iommu_queue_command(iommu, &cmd);
				1127	}
				1128
				1129	static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
				1130	{
				1131	u32 devid;
				1132
				1133	for (devid = 0; devid <= 0xffff; ++devid)
				1134	iommu_flush_dte(iommu, devid);
				1135
				1136	iommu_completion_wait(iommu);
				1137	}
				1138
				1139	/*
				1140	* This function uses heavy locking and may disable irqs for some time. But
				1141	* this is no issue because it is only called during resume.
				1142	*/
				1143	static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
				1144	{
				1145	u32 dom_id;
				1146
				1147	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
				1148	struct iommu_cmd cmd;
				1149	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				1150	dom_id, 1);
				1151	iommu_queue_command(iommu, &cmd);
				1152	}
				1153
				1154	iommu_completion_wait(iommu);
				1155	}
				1156
				1157	static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
				1158	{
				1159	struct iommu_cmd cmd;
				1160
				1161	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				1162	dom_id, 1);
				1163	iommu_queue_command(iommu, &cmd);
				1164
				1165	iommu_completion_wait(iommu);
				1166	}
				1167
				1168	static void amd_iommu_flush_all(struct amd_iommu *iommu)
				1169	{
				1170	struct iommu_cmd cmd;
				1171
				1172	build_inv_all(&cmd);
				1173
				1174	iommu_queue_command(iommu, &cmd);
				1175	iommu_completion_wait(iommu);
				1176	}
				1177
				1178	static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
				1179	{
				1180	struct iommu_cmd cmd;
				1181
				1182	build_inv_irt(&cmd, devid);
				1183
				1184	iommu_queue_command(iommu, &cmd);
				1185	}
				1186
				1187	static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
				1188	{
				1189	u32 devid;
				1190
				1191	for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
				1192	iommu_flush_irt(iommu, devid);
				1193
				1194	iommu_completion_wait(iommu);
				1195	}
				1196
				1197	void iommu_flush_all_caches(struct amd_iommu *iommu)
				1198	{
				1199	if (iommu_feature(iommu, FEATURE_IA)) {
				1200	amd_iommu_flush_all(iommu);
				1201	} else {
				1202	amd_iommu_flush_dte_all(iommu);
				1203	amd_iommu_flush_irt_all(iommu);
				1204	amd_iommu_flush_tlb_all(iommu);
				1205	}
				1206	}
				1207
				1208	/*
				1209	* Command send function for flushing on-device TLB
				1210	*/
				1211	static int device_flush_iotlb(struct iommu_dev_data *dev_data,
				1212	u64 address, size_t size)
				1213	{
				1214	struct amd_iommu *iommu;
				1215	struct iommu_cmd cmd;
				1216	int qdep;
				1217
				1218	qdep = dev_data->ats.qdep;
				1219	iommu = amd_iommu_rlookup_table[dev_data->devid];
				1220
				1221	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
				1222
				1223	return iommu_queue_command(iommu, &cmd);
				1224	}
				1225
				1226	/*
				1227	* Command send function for invalidating a device table entry
				1228	*/
				1229	static int device_flush_dte(struct iommu_dev_data *dev_data)
				1230	{
				1231	struct amd_iommu *iommu;
				1232	u16 alias;
				1233	int ret;
				1234
				1235	iommu = amd_iommu_rlookup_table[dev_data->devid];
				1236	alias = dev_data->alias;
				1237
				1238	ret = iommu_flush_dte(iommu, dev_data->devid);
				1239	if (!ret && alias != dev_data->devid)
				1240	ret = iommu_flush_dte(iommu, alias);
				1241	if (ret)
				1242	return ret;
				1243
				1244	if (dev_data->ats.enabled)
				1245	ret = device_flush_iotlb(dev_data, 0, ~0UL);
				1246
				1247	return ret;
				1248	}
				1249
				1250	/*
				1251	* TLB invalidation function which is called from the mapping functions.
				1252	* It invalidates a single PTE if the range to flush is within a single
				1253	* page. Otherwise it flushes the whole TLB of the IOMMU.
				1254	*/
				1255	static void __domain_flush_pages(struct protection_domain *domain,
				1256	u64 address, size_t size, int pde)
				1257	{
				1258	struct iommu_dev_data *dev_data;
				1259	struct iommu_cmd cmd;
				1260	int ret = 0, i;
				1261
				1262	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
				1263
				1264	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
				1265	if (!domain->dev_iommu[i])
				1266	continue;
				1267
				1268	/*
				1269	* Devices of this domain are behind this IOMMU
				1270	* We need a TLB flush
				1271	*/
				1272	ret \|= iommu_queue_command(amd_iommus[i], &cmd);
				1273	}
				1274
				1275	list_for_each_entry(dev_data, &domain->dev_list, list) {
				1276
				1277	if (!dev_data->ats.enabled)
				1278	continue;
				1279
				1280	ret \|= device_flush_iotlb(dev_data, address, size);
				1281	}
				1282
				1283	WARN_ON(ret);
				1284	}
				1285
				1286	static void domain_flush_pages(struct protection_domain *domain,
				1287	u64 address, size_t size)
				1288	{
				1289	__domain_flush_pages(domain, address, size, 0);
				1290	}
				1291
				1292	/* Flush the whole IO/TLB for a given protection domain */
				1293	static void domain_flush_tlb(struct protection_domain *domain)
				1294	{
				1295	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
				1296	}
				1297
				1298	/* Flush the whole IO/TLB for a given protection domain - including PDE */
				1299	static void domain_flush_tlb_pde(struct protection_domain *domain)
				1300	{
				1301	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
				1302	}
				1303
				1304	static void domain_flush_complete(struct protection_domain *domain)
				1305	{
				1306	int i;
				1307
				1308	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
				1309	if (domain && !domain->dev_iommu[i])
				1310	continue;
				1311
				1312	/*
				1313	* Devices of this domain are behind this IOMMU
				1314	* We need to wait for completion of all commands.
				1315	*/
				1316	iommu_completion_wait(amd_iommus[i]);
				1317	}
				1318	}
				1319
				1320
				1321	/*
				1322	* This function flushes the DTEs for all devices in domain
				1323	*/
				1324	static void domain_flush_devices(struct protection_domain *domain)
				1325	{
				1326	struct iommu_dev_data *dev_data;
				1327
				1328	list_for_each_entry(dev_data, &domain->dev_list, list)
				1329	device_flush_dte(dev_data);
				1330	}
				1331
				1332	/****************************************************************************
				1333	*
				1334	* The functions below are used to create the page table mappings for
				1335	* unity mapped regions.
				1336	*
				1337	****************************************************************************/
				1338
				1339	/*
				1340	* This function is used to add another level to an IO page table. Adding
				1341	* another level increases the size of the address space by 9 bits to a size up
				1342	* to 64 bits.
				1343	*/
				1344	static void increase_address_space(struct protection_domain *domain,
				1345	gfp_t gfp)
				1346	{
				1347	unsigned long flags;
				1348	u64 *pte;
				1349
				1350	spin_lock_irqsave(&domain->lock, flags);
				1351
				1352	if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
				1353	/* address space already 64 bit large */
				1354	goto out;
				1355
				1356	pte = (void *)get_zeroed_page(gfp);
				1357	if (!pte)
				1358	goto out;
				1359
				1360	*pte = PM_LEVEL_PDE(domain->mode,
				1361	iommu_virt_to_phys(domain->pt_root));
				1362	domain->pt_root = pte;
				1363	domain->mode += 1;
				1364	domain->updated = true;
				1365
				1366	out:
				1367	spin_unlock_irqrestore(&domain->lock, flags);
				1368
				1369	return;
				1370	}
				1371
				1372	static u64 alloc_pte(struct protection_domain domain,
				1373	unsigned long address,
				1374	unsigned long page_size,
				1375	u64 **pte_page,
				1376	gfp_t gfp)
				1377	{
				1378	int level, end_lvl;
				1379	u64 pte, page;
				1380
				1381	BUG_ON(!is_power_of_2(page_size));
				1382
				1383	while (address > PM_LEVEL_SIZE(domain->mode))
				1384	increase_address_space(domain, gfp);
				1385
				1386	level = domain->mode - 1;
				1387	pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
				1388	address = PAGE_SIZE_ALIGN(address, page_size);
				1389	end_lvl = PAGE_SIZE_LEVEL(page_size);
				1390
				1391	while (level > end_lvl) {
				1392	u64 __pte, __npte;
				1393
				1394	__pte = *pte;
				1395
				1396	if (!IOMMU_PTE_PRESENT(__pte)) {
				1397	page = (u64 *)get_zeroed_page(gfp);
				1398	if (!page)
				1399	return NULL;
				1400
				1401	__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
				1402
				1403	/* pte could have been changed somewhere. */
				1404	if (cmpxchg64(pte, __pte, __npte) != __pte) {
				1405	free_page((unsigned long)page);
				1406	continue;
				1407	}
				1408	}
				1409
				1410	/* No level skipping support yet */
				1411	if (PM_PTE_LEVEL(*pte) != level)
				1412	return NULL;
				1413
				1414	level -= 1;
				1415
				1416	pte = IOMMU_PTE_PAGE(*pte);
				1417
				1418	if (pte_page && level == end_lvl)
				1419	*pte_page = pte;
				1420
				1421	pte = &pte[PM_LEVEL_INDEX(level, address)];
				1422	}
				1423
				1424	return pte;
				1425	}
				1426
				1427	/*
				1428	* This function checks if there is a PTE for a given dma address. If
				1429	* there is one, it returns the pointer to it.
				1430	*/
				1431	static u64 fetch_pte(struct protection_domain domain,
				1432	unsigned long address,
				1433	unsigned long *page_size)
				1434	{
				1435	int level;
				1436	u64 *pte;
				1437
				1438	if (address > PM_LEVEL_SIZE(domain->mode))
				1439	return NULL;
				1440
				1441	level = domain->mode - 1;
				1442	pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
				1443	*page_size = PTE_LEVEL_PAGE_SIZE(level);
				1444
				1445	while (level > 0) {
				1446
				1447	/* Not Present */
				1448	if (!IOMMU_PTE_PRESENT(*pte))
				1449	return NULL;
				1450
				1451	/* Large PTE */
				1452	if (PM_PTE_LEVEL(*pte) == 7 \|\|
				1453	PM_PTE_LEVEL(*pte) == 0)
				1454	break;
				1455
				1456	/* No level skipping support yet */
				1457	if (PM_PTE_LEVEL(*pte) != level)
				1458	return NULL;
				1459
				1460	level -= 1;
				1461
				1462	/* Walk to the next level */
				1463	pte = IOMMU_PTE_PAGE(*pte);
				1464	pte = &pte[PM_LEVEL_INDEX(level, address)];
				1465	*page_size = PTE_LEVEL_PAGE_SIZE(level);
				1466	}
				1467
				1468	if (PM_PTE_LEVEL(*pte) == 0x07) {
				1469	unsigned long pte_mask;
				1470
				1471	/*
				1472	* If we have a series of large PTEs, make
				1473	* sure to return a pointer to the first one.
				1474	*/
				1475	page_size = pte_mask = PTE_PAGE_SIZE(pte);
				1476	pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
				1477	pte = (u64 *)(((unsigned long)pte) & pte_mask);
				1478	}
				1479
				1480	return pte;
				1481	}
				1482
				1483	/*
				1484	* Generic mapping functions. It maps a physical address into a DMA
				1485	* address space. It allocates the page table pages if necessary.
				1486	* In the future it can be extended to a generic mapping function
				1487	* supporting all features of AMD IOMMU page tables like level skipping
				1488	* and full 64 bit address spaces.
				1489	*/
				1490	static int iommu_map_page(struct protection_domain *dom,
				1491	unsigned long bus_addr,
				1492	unsigned long phys_addr,
				1493	unsigned long page_size,
				1494	int prot,
				1495	gfp_t gfp)
				1496	{
				1497	u64 __pte, *pte;
				1498	int i, count;
				1499
				1500	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
				1501	BUG_ON(!IS_ALIGNED(phys_addr, page_size));
				1502
				1503	if (!(prot & IOMMU_PROT_MASK))
				1504	return -EINVAL;
				1505
				1506	count = PAGE_SIZE_PTE_COUNT(page_size);
				1507	pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
				1508
				1509	if (!pte)
				1510	return -ENOMEM;
				1511
				1512	for (i = 0; i < count; ++i)
				1513	if (IOMMU_PTE_PRESENT(pte[i]))
				1514	return -EBUSY;
				1515
				1516	if (count > 1) {
				1517	__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
				1518	__pte \|= PM_LEVEL_ENC(7) \| IOMMU_PTE_PR \| IOMMU_PTE_FC;
				1519	} else
				1520	__pte = __sme_set(phys_addr) \| IOMMU_PTE_PR \| IOMMU_PTE_FC;
				1521
				1522	if (prot & IOMMU_PROT_IR)
				1523	__pte \|= IOMMU_PTE_IR;
				1524	if (prot & IOMMU_PROT_IW)
				1525	__pte \|= IOMMU_PTE_IW;
				1526
				1527	for (i = 0; i < count; ++i)
				1528	pte[i] = __pte;
				1529
				1530	update_domain(dom);
				1531
				1532	return 0;
				1533	}
				1534
				1535	static unsigned long iommu_unmap_page(struct protection_domain *dom,
				1536	unsigned long bus_addr,
				1537	unsigned long page_size)
				1538	{
				1539	unsigned long long unmapped;
				1540	unsigned long unmap_size;
				1541	u64 *pte;
				1542
				1543	BUG_ON(!is_power_of_2(page_size));
				1544
				1545	unmapped = 0;
				1546
				1547	while (unmapped < page_size) {
				1548
				1549	pte = fetch_pte(dom, bus_addr, &unmap_size);
				1550
				1551	if (pte) {
				1552	int i, count;
				1553
				1554	count = PAGE_SIZE_PTE_COUNT(unmap_size);
				1555	for (i = 0; i < count; i++)
				1556	pte[i] = 0ULL;
				1557	}
				1558
				1559	bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
				1560	unmapped += unmap_size;
				1561	}
				1562
				1563	BUG_ON(unmapped && !is_power_of_2(unmapped));
				1564
				1565	return unmapped;
				1566	}
				1567
				1568	/****************************************************************************
				1569	*
				1570	* The next functions belong to the address allocator for the dma_ops
				1571	* interface functions.
				1572	*
				1573	****************************************************************************/
				1574
				1575
				1576	static unsigned long dma_ops_alloc_iova(struct device *dev,
				1577	struct dma_ops_domain *dma_dom,
				1578	unsigned int pages, u64 dma_mask)
				1579	{
				1580	unsigned long pfn = 0;
				1581
				1582	pages = __roundup_pow_of_two(pages);
				1583
				1584	if (dma_mask > DMA_BIT_MASK(32))
				1585	pfn = alloc_iova_fast(&dma_dom->iovad, pages,
				1586	IOVA_PFN(DMA_BIT_MASK(32)));
				1587
				1588	if (!pfn)
				1589	pfn = alloc_iova_fast(&dma_dom->iovad, pages, IOVA_PFN(dma_mask));
				1590
				1591	return (pfn << PAGE_SHIFT);
				1592	}
				1593
				1594	static void dma_ops_free_iova(struct dma_ops_domain *dma_dom,
				1595	unsigned long address,
				1596	unsigned int pages)
				1597	{
				1598	pages = __roundup_pow_of_two(pages);
				1599	address >>= PAGE_SHIFT;
				1600
				1601	free_iova_fast(&dma_dom->iovad, address, pages);
				1602	}
				1603
				1604	/****************************************************************************
				1605	*
				1606	* The next functions belong to the domain allocation. A domain is
				1607	* allocated for every IOMMU as the default domain. If device isolation
				1608	* is enabled, every device gets its own domain. The most important thing
				1609	* about domains is the page table mapping the DMA address space they
				1610	* contain.
				1611	*
				1612	****************************************************************************/
				1613
				1614	/*
				1615	* This function adds a protection domain to the global protection domain list
				1616	*/
				1617	static void add_domain_to_list(struct protection_domain *domain)
				1618	{
				1619	unsigned long flags;
				1620
				1621	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
				1622	list_add(&domain->list, &amd_iommu_pd_list);
				1623	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
				1624	}
				1625
				1626	/*
				1627	* This function removes a protection domain to the global
				1628	* protection domain list
				1629	*/
				1630	static void del_domain_from_list(struct protection_domain *domain)
				1631	{
				1632	unsigned long flags;
				1633
				1634	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
				1635	list_del(&domain->list);
				1636	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
				1637	}
				1638
				1639	static u16 domain_id_alloc(void)
				1640	{
				1641	unsigned long flags;
				1642	int id;
				1643
				1644	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				1645	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
				1646	BUG_ON(id == 0);
				1647	if (id > 0 && id < MAX_DOMAIN_ID)
				1648	__set_bit(id, amd_iommu_pd_alloc_bitmap);
				1649	else
				1650	id = 0;
				1651	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				1652
				1653	return id;
				1654	}
				1655
				1656	static void domain_id_free(int id)
				1657	{
				1658	unsigned long flags;
				1659
				1660	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				1661	if (id > 0 && id < MAX_DOMAIN_ID)
				1662	__clear_bit(id, amd_iommu_pd_alloc_bitmap);
				1663	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				1664	}
				1665
				1666	#define DEFINE_FREE_PT_FN(LVL, FN) \
				1667	static void free_pt_##LVL (unsigned long __pt) \
				1668	{ \
				1669	unsigned long p; \
				1670	u64 *pt; \
				1671	int i; \
				1672	\
				1673	pt = (u64 *)__pt; \
				1674	\
				1675	for (i = 0; i < 512; ++i) { \
				1676	/* PTE present? */ \
				1677	if (!IOMMU_PTE_PRESENT(pt[i])) \
				1678	continue; \
				1679	\
				1680	/* Large PTE? */ \
				1681	if (PM_PTE_LEVEL(pt[i]) == 0 \|\| \
				1682	PM_PTE_LEVEL(pt[i]) == 7) \
				1683	continue; \
				1684	\
				1685	p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \
				1686	FN(p); \
				1687	} \
				1688	free_page((unsigned long)pt); \
				1689	}
				1690
				1691	DEFINE_FREE_PT_FN(l2, free_page)
				1692	DEFINE_FREE_PT_FN(l3, free_pt_l2)
				1693	DEFINE_FREE_PT_FN(l4, free_pt_l3)
				1694	DEFINE_FREE_PT_FN(l5, free_pt_l4)
				1695	DEFINE_FREE_PT_FN(l6, free_pt_l5)
				1696
				1697	static void free_pagetable(struct protection_domain *domain)
				1698	{
				1699	unsigned long root = (unsigned long)domain->pt_root;
				1700
				1701	switch (domain->mode) {
				1702	case PAGE_MODE_NONE:
				1703	break;
				1704	case PAGE_MODE_1_LEVEL:
				1705	free_page(root);
				1706	break;
				1707	case PAGE_MODE_2_LEVEL:
				1708	free_pt_l2(root);
				1709	break;
				1710	case PAGE_MODE_3_LEVEL:
				1711	free_pt_l3(root);
				1712	break;
				1713	case PAGE_MODE_4_LEVEL:
				1714	free_pt_l4(root);
				1715	break;
				1716	case PAGE_MODE_5_LEVEL:
				1717	free_pt_l5(root);
				1718	break;
				1719	case PAGE_MODE_6_LEVEL:
				1720	free_pt_l6(root);
				1721	break;
				1722	default:
				1723	BUG();
				1724	}
				1725	}
				1726
				1727	static void free_gcr3_tbl_level1(u64 *tbl)
				1728	{
				1729	u64 *ptr;
				1730	int i;
				1731
				1732	for (i = 0; i < 512; ++i) {
				1733	if (!(tbl[i] & GCR3_VALID))
				1734	continue;
				1735
				1736	ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
				1737
				1738	free_page((unsigned long)ptr);
				1739	}
				1740	}
				1741
				1742	static void free_gcr3_tbl_level2(u64 *tbl)
				1743	{
				1744	u64 *ptr;
				1745	int i;
				1746
				1747	for (i = 0; i < 512; ++i) {
				1748	if (!(tbl[i] & GCR3_VALID))
				1749	continue;
				1750
				1751	ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
				1752
				1753	free_gcr3_tbl_level1(ptr);
				1754	}
				1755	}
				1756
				1757	static void free_gcr3_table(struct protection_domain *domain)
				1758	{
				1759	if (domain->glx == 2)
				1760	free_gcr3_tbl_level2(domain->gcr3_tbl);
				1761	else if (domain->glx == 1)
				1762	free_gcr3_tbl_level1(domain->gcr3_tbl);
				1763	else
				1764	BUG_ON(domain->glx != 0);
				1765
				1766	free_page((unsigned long)domain->gcr3_tbl);
				1767	}
				1768
				1769	static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
				1770	{
				1771	domain_flush_tlb(&dom->domain);
				1772	domain_flush_complete(&dom->domain);
				1773	}
				1774
				1775	static void iova_domain_flush_tlb(struct iova_domain *iovad)
				1776	{
				1777	struct dma_ops_domain *dom;
				1778
				1779	dom = container_of(iovad, struct dma_ops_domain, iovad);
				1780
				1781	dma_ops_domain_flush_tlb(dom);
				1782	}
				1783
				1784	/*
				1785	* Free a domain, only used if something went wrong in the
				1786	* allocation path and we need to free an already allocated page table
				1787	*/
				1788	static void dma_ops_domain_free(struct dma_ops_domain *dom)
				1789	{
				1790	if (!dom)
				1791	return;
				1792
				1793	del_domain_from_list(&dom->domain);
				1794
				1795	put_iova_domain(&dom->iovad);
				1796
				1797	free_pagetable(&dom->domain);
				1798
				1799	if (dom->domain.id)
				1800	domain_id_free(dom->domain.id);
				1801
				1802	kfree(dom);
				1803	}
				1804
				1805	/*
				1806	* Allocates a new protection domain usable for the dma_ops functions.
				1807	* It also initializes the page table and the address allocator data
				1808	* structures required for the dma_ops interface
				1809	*/
				1810	static struct dma_ops_domain *dma_ops_domain_alloc(void)
				1811	{
				1812	struct dma_ops_domain *dma_dom;
				1813
				1814	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
				1815	if (!dma_dom)
				1816	return NULL;
				1817
				1818	if (protection_domain_init(&dma_dom->domain))
				1819	goto free_dma_dom;
				1820
				1821	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
				1822	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
				1823	dma_dom->domain.flags = PD_DMA_OPS_MASK;
				1824	if (!dma_dom->domain.pt_root)
				1825	goto free_dma_dom;
				1826
				1827	init_iova_domain(&dma_dom->iovad, PAGE_SIZE,
				1828	IOVA_START_PFN, DMA_32BIT_PFN);
				1829
				1830	if (init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL))
				1831	goto free_dma_dom;
				1832
				1833	/* Initialize reserved ranges */
				1834	copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);
				1835
				1836	add_domain_to_list(&dma_dom->domain);
				1837
				1838	return dma_dom;
				1839
				1840	free_dma_dom:
				1841	dma_ops_domain_free(dma_dom);
				1842
				1843	return NULL;
				1844	}
				1845
				1846	/*
				1847	* little helper function to check whether a given protection domain is a
				1848	* dma_ops domain
				1849	*/
				1850	static bool dma_ops_domain(struct protection_domain *domain)
				1851	{
				1852	return domain->flags & PD_DMA_OPS_MASK;
				1853	}
				1854
				1855	static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
				1856	{
				1857	u64 pte_root = 0;
				1858	u64 flags = 0;
				1859	u32 old_domid;
				1860
				1861	if (domain->mode != PAGE_MODE_NONE)
				1862	pte_root = iommu_virt_to_phys(domain->pt_root);
				1863
				1864	pte_root \|= (domain->mode & DEV_ENTRY_MODE_MASK)
				1865	<< DEV_ENTRY_MODE_SHIFT;
				1866	pte_root \|= DTE_FLAG_IR \| DTE_FLAG_IW \| DTE_FLAG_V \| DTE_FLAG_TV;
				1867
				1868	flags = amd_iommu_dev_table[devid].data[1];
				1869
				1870	if (ats)
				1871	flags \|= DTE_FLAG_IOTLB;
				1872
				1873	if (domain->flags & PD_IOMMUV2_MASK) {
				1874	u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
				1875	u64 glx = domain->glx;
				1876	u64 tmp;
				1877
				1878	pte_root \|= DTE_FLAG_GV;
				1879	pte_root \|= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
				1880
				1881	/* First mask out possible old values for GCR3 table */
				1882	tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
				1883	flags &= ~tmp;
				1884
				1885	tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
				1886	flags &= ~tmp;
				1887
				1888	/* Encode GCR3 table into DTE */
				1889	tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
				1890	pte_root \|= tmp;
				1891
				1892	tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
				1893	flags \|= tmp;
				1894
				1895	tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
				1896	flags \|= tmp;
				1897	}
				1898
				1899	flags &= ~DEV_DOMID_MASK;
				1900	flags \|= domain->id;
				1901
				1902	old_domid = amd_iommu_dev_table[devid].data[1] & DEV_DOMID_MASK;
				1903	amd_iommu_dev_table[devid].data[1] = flags;
				1904	amd_iommu_dev_table[devid].data[0] = pte_root;
				1905
				1906	/*
				1907	* A kdump kernel might be replacing a domain ID that was copied from
				1908	* the previous kernel--if so, it needs to flush the translation cache
				1909	* entries for the old domain ID that is being overwritten
				1910	*/
				1911	if (old_domid) {
				1912	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
				1913
				1914	amd_iommu_flush_tlb_domid(iommu, old_domid);
				1915	}
				1916	}
				1917
				1918	static void clear_dte_entry(u16 devid)
				1919	{
				1920	/* remove entry from the device table seen by the hardware */
				1921	amd_iommu_dev_table[devid].data[0] = DTE_FLAG_V \| DTE_FLAG_TV;
				1922	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
				1923
				1924	amd_iommu_apply_erratum_63(devid);
				1925	}
				1926
				1927	static void do_attach(struct iommu_dev_data *dev_data,
				1928	struct protection_domain *domain)
				1929	{
				1930	struct amd_iommu *iommu;
				1931	u16 alias;
				1932	bool ats;
				1933
				1934	iommu = amd_iommu_rlookup_table[dev_data->devid];
				1935	alias = dev_data->alias;
				1936	ats = dev_data->ats.enabled;
				1937
				1938	/* Update data structures */
				1939	dev_data->domain = domain;
				1940	list_add(&dev_data->list, &domain->dev_list);
				1941
				1942	/* Do reference counting */
				1943	domain->dev_iommu[iommu->index] += 1;
				1944	domain->dev_cnt += 1;
				1945
				1946	/* Update device table */
				1947	set_dte_entry(dev_data->devid, domain, ats);
				1948	if (alias != dev_data->devid)
				1949	set_dte_entry(alias, domain, ats);
				1950
				1951	device_flush_dte(dev_data);
				1952	}
				1953
				1954	static void do_detach(struct iommu_dev_data *dev_data)
				1955	{
				1956	struct protection_domain *domain = dev_data->domain;
				1957	struct amd_iommu *iommu;
				1958	u16 alias;
				1959
				1960	/*
				1961	* First check if the device is still attached. It might already
				1962	* be detached from its domain because the generic
				1963	* iommu_detach_group code detached it and we try again here in
				1964	* our alias handling.
				1965	*/
				1966	if (!dev_data->domain)
				1967	return;
				1968
				1969	iommu = amd_iommu_rlookup_table[dev_data->devid];
				1970	alias = dev_data->alias;
				1971
				1972	/* Update data structures */
				1973	dev_data->domain = NULL;
				1974	list_del(&dev_data->list);
				1975	clear_dte_entry(dev_data->devid);
				1976	if (alias != dev_data->devid)
				1977	clear_dte_entry(alias);
				1978
				1979	/* Flush the DTE entry */
				1980	device_flush_dte(dev_data);
				1981
				1982	/* Flush IOTLB */
				1983	domain_flush_tlb_pde(domain);
				1984
				1985	/* Wait for the flushes to finish */
				1986	domain_flush_complete(domain);
				1987
				1988	/* decrease reference counters - needs to happen after the flushes */
				1989	domain->dev_iommu[iommu->index] -= 1;
				1990	domain->dev_cnt -= 1;
				1991	}
				1992
				1993	/*
				1994	* If a device is not yet associated with a domain, this function does
				1995	* assigns it visible for the hardware
				1996	*/
				1997	static int __attach_device(struct iommu_dev_data *dev_data,
				1998	struct protection_domain *domain)
				1999	{
				2000	int ret;
				2001
				2002	/*
				2003	* Must be called with IRQs disabled. Warn here to detect early
				2004	* when its not.
				2005	*/
				2006	WARN_ON(!irqs_disabled());
				2007
				2008	/* lock domain */
				2009	spin_lock(&domain->lock);
				2010
				2011	ret = -EBUSY;
				2012	if (dev_data->domain != NULL)
				2013	goto out_unlock;
				2014
				2015	/* Attach alias group root */
				2016	do_attach(dev_data, domain);
				2017
				2018	ret = 0;
				2019
				2020	out_unlock:
				2021
				2022	/* ready */
				2023	spin_unlock(&domain->lock);
				2024
				2025	return ret;
				2026	}
				2027
				2028
				/* Switch off ATS, PRI and PASID on @pdev, in that order. */
				static void pdev_iommuv2_disable(struct pci_dev *pdev)
				{
					pci_disable_ats(pdev);

					pci_disable_pri(pdev);

					pci_disable_pasid(pdev);
				}
				2035
				2036	/* FIXME: Change generic reset-function to do the same */
				2037	static int pri_reset_while_enabled(struct pci_dev *pdev)
				2038	{
				2039	u16 control;
				2040	int pos;
				2041
				2042	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
				2043	if (!pos)
				2044	return -EINVAL;
				2045
				2046	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
				2047	control \|= PCI_PRI_CTRL_RESET;
				2048	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
				2049
				2050	return 0;
				2051	}
				2052
				2053	static int pdev_iommuv2_enable(struct pci_dev *pdev)
				2054	{
				2055	bool reset_enable;
				2056	int reqs, ret;
				2057
				2058	/* FIXME: Hardcode number of outstanding requests for now */
				2059	reqs = 32;
				2060	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
				2061	reqs = 1;
				2062	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
				2063
				2064	/* Only allow access to user-accessible pages */
				2065	ret = pci_enable_pasid(pdev, 0);
				2066	if (ret)
				2067	goto out_err;
				2068
				2069	/* First reset the PRI state of the device */
				2070	ret = pci_reset_pri(pdev);
				2071	if (ret)
				2072	goto out_err;
				2073
				2074	/* Enable PRI */
				2075	ret = pci_enable_pri(pdev, reqs);
				2076	if (ret)
				2077	goto out_err;
				2078
				2079	if (reset_enable) {
				2080	ret = pri_reset_while_enabled(pdev);
				2081	if (ret)
				2082	goto out_err;
				2083	}
				2084
				2085	ret = pci_enable_ats(pdev, PAGE_SHIFT);
				2086	if (ret)
				2087	goto out_err;
				2088
				2089	return 0;
				2090
				2091	out_err:
				2092	pci_disable_pri(pdev);
				2093	pci_disable_pasid(pdev);
				2094
				2095	return ret;
				2096	}
				2097
				2098	/* FIXME: Move this to PCI code */
				2099	#define PCI_PRI_TLP_OFF (1 << 15)
				2100
				2101	static bool pci_pri_tlp_required(struct pci_dev *pdev)
				2102	{
				2103	u16 status;
				2104	int pos;
				2105
				2106	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
				2107	if (!pos)
				2108	return false;
				2109
				2110	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
				2111
				2112	return (status & PCI_PRI_TLP_OFF) ? true : false;
				2113	}
				2114
				2115	/*
				2116	* If a device is not yet associated with a domain, this function
				2117	* assigns it visible for the hardware
				2118	*/
				2119	static int attach_device(struct device *dev,
				2120	struct protection_domain *domain)
				2121	{
				2122	struct pci_dev *pdev;
				2123	struct iommu_dev_data *dev_data;
				2124	unsigned long flags;
				2125	int ret;
				2126
				2127	dev_data = get_dev_data(dev);
				2128
				2129	if (!dev_is_pci(dev))
				2130	goto skip_ats_check;
				2131
				2132	pdev = to_pci_dev(dev);
				2133	if (domain->flags & PD_IOMMUV2_MASK) {
				2134	if (!dev_data->passthrough)
				2135	return -EINVAL;
				2136
				2137	if (dev_data->iommu_v2) {
				2138	if (pdev_iommuv2_enable(pdev) != 0)
				2139	return -EINVAL;
				2140
				2141	dev_data->ats.enabled = true;
				2142	dev_data->ats.qdep = pci_ats_queue_depth(pdev);
				2143	dev_data->pri_tlp = pci_pri_tlp_required(pdev);
				2144	}
				2145	} else if (amd_iommu_iotlb_sup &&
				2146	pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
				2147	dev_data->ats.enabled = true;
				2148	dev_data->ats.qdep = pci_ats_queue_depth(pdev);
				2149	}
				2150
				2151	skip_ats_check:
				2152	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				2153	ret = __attach_device(dev_data, domain);
				2154	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				2155
				2156	/*
				2157	* We might boot into a crash-kernel here. The crashed kernel
				2158	* left the caches in the IOMMU dirty. So we have to flush
				2159	* here to evict all dirty stuff.
				2160	*/
				2161	domain_flush_tlb_pde(domain);
				2162
				2163	domain_flush_complete(domain);
				2164
				2165	return ret;
				2166	}
				2167
				2168	/*
				2169	* Removes a device from a protection domain (unlocked)
				2170	*/
				2171	static void __detach_device(struct iommu_dev_data *dev_data)
				2172	{
				2173	struct protection_domain *domain;
				2174
				2175	/*
				2176	* Must be called with IRQs disabled. Warn here to detect early
				2177	* when its not.
				2178	*/
				2179	WARN_ON(!irqs_disabled());
				2180
				2181	if (WARN_ON(!dev_data->domain))
				2182	return;
				2183
				2184	domain = dev_data->domain;
				2185
				2186	spin_lock(&domain->lock);
				2187
				2188	do_detach(dev_data);
				2189
				2190	spin_unlock(&domain->lock);
				2191	}
				2192
				2193	/*
				2194	* Removes a device from a protection domain (with devtable_lock held)
				2195	*/
				2196	static void detach_device(struct device *dev)
				2197	{
				2198	struct protection_domain *domain;
				2199	struct iommu_dev_data *dev_data;
				2200	unsigned long flags;
				2201
				2202	dev_data = get_dev_data(dev);
				2203	domain = dev_data->domain;
				2204
				2205	/* lock device table */
				2206	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				2207	__detach_device(dev_data);
				2208	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				2209
				2210	if (!dev_is_pci(dev))
				2211	return;
				2212
				2213	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
				2214	pdev_iommuv2_disable(to_pci_dev(dev));
				2215	else if (dev_data->ats.enabled)
				2216	pci_disable_ats(to_pci_dev(dev));
				2217
				2218	dev_data->ats.enabled = false;
				2219	}
				2220
				2221	static int amd_iommu_add_device(struct device *dev)
				2222	{
				2223	struct iommu_dev_data *dev_data;
				2224	struct iommu_domain *domain;
				2225	struct amd_iommu *iommu;
				2226	int ret, devid;
				2227
				2228	if (!check_device(dev) \|\| get_dev_data(dev))
				2229	return 0;
				2230
				2231	devid = get_device_id(dev);
				2232	if (devid < 0)
				2233	return devid;
				2234
				2235	iommu = amd_iommu_rlookup_table[devid];
				2236
				2237	ret = iommu_init_device(dev);
				2238	if (ret) {
				2239	if (ret != -ENOTSUPP)
				2240	pr_err("Failed to initialize device %s - trying to proceed anyway\n",
				2241	dev_name(dev));
				2242
				2243	iommu_ignore_device(dev);
				2244	dev->dma_ops = &nommu_dma_ops;
				2245	goto out;
				2246	}
				2247	init_iommu_group(dev);
				2248
				2249	dev_data = get_dev_data(dev);
				2250
				2251	BUG_ON(!dev_data);
				2252
				2253	if (iommu_pass_through \|\| dev_data->iommu_v2)
				2254	iommu_request_dm_for_dev(dev);
				2255
				2256	/* Domains are initialized for this device - have a look what we ended up with */
				2257	domain = iommu_get_domain_for_dev(dev);
				2258	if (domain->type == IOMMU_DOMAIN_IDENTITY)
				2259	dev_data->passthrough = true;
				2260	else
				2261	dev->dma_ops = &amd_iommu_dma_ops;
				2262
				2263	out:
				2264	iommu_completion_wait(iommu);
				2265
				2266	return 0;
				2267	}
				2268
				2269	static void amd_iommu_remove_device(struct device *dev)
				2270	{
				2271	struct amd_iommu *iommu;
				2272	int devid;
				2273
				2274	if (!check_device(dev))
				2275	return;
				2276
				2277	devid = get_device_id(dev);
				2278	if (devid < 0)
				2279	return;
				2280
				2281	iommu = amd_iommu_rlookup_table[devid];
				2282
				2283	iommu_uninit_device(dev);
				2284	iommu_completion_wait(iommu);
				2285	}
				2286
				/*
				 * Find the IOMMU group for @dev: PCI devices use the generic PCI
				 * grouping, everything else is grouped via acpihid_device_group().
				 */
				static struct iommu_group *amd_iommu_device_group(struct device *dev)
				{
					if (dev_is_pci(dev))
						return pci_device_group(dev);

					return acpihid_device_group(dev);
				}
				2294
				2295	/*****************************************************************************
				2296	*
				2297	* The next functions belong to the dma_ops mapping/unmapping code.
				2298	*
				2299	*****************************************************************************/
				2300
				2301	/*
				2302	* In the dma_ops path we only have the struct device. This function
				2303	* finds the corresponding IOMMU, the protection domain and the
				2304	* requestor id for a given device.
				2305	* If the device is not yet associated with a domain this is also done
				2306	* in this function.
				2307	*/
				2308	static struct protection_domain get_domain(struct device dev)
				2309	{
				2310	struct protection_domain *domain;
				2311	struct iommu_domain *io_domain;
				2312
				2313	if (!check_device(dev))
				2314	return ERR_PTR(-EINVAL);
				2315
				2316	domain = get_dev_data(dev)->domain;
				2317	if (domain == NULL && get_dev_data(dev)->defer_attach) {
				2318	get_dev_data(dev)->defer_attach = false;
				2319	io_domain = iommu_get_domain_for_dev(dev);
				2320	domain = to_pdomain(io_domain);
				2321	attach_device(dev, domain);
				2322	}
				2323	if (domain == NULL)
				2324	return ERR_PTR(-EBUSY);
				2325
				2326	if (!dma_ops_domain(domain))
				2327	return ERR_PTR(-EBUSY);
				2328
				2329	return domain;
				2330	}
				2331
				2332	static void update_device_table(struct protection_domain *domain)
				2333	{
				2334	struct iommu_dev_data *dev_data;
				2335
				2336	list_for_each_entry(dev_data, &domain->dev_list, list) {
				2337	set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
				2338
				2339	if (dev_data->devid == dev_data->alias)
				2340	continue;
				2341
				2342	/* There is an alias, update device table entry for it */
				2343	set_dte_entry(dev_data->alias, domain, dev_data->ats.enabled);
				2344	}
				2345	}
				2346
				2347	static void update_domain(struct protection_domain *domain)
				2348	{
				2349	if (!domain->updated)
				2350	return;
				2351
				2352	update_device_table(domain);
				2353
				2354	domain_flush_devices(domain);
				2355	domain_flush_tlb_pde(domain);
				2356
				2357	domain->updated = false;
				2358	}
				2359
				2360	static int dir2prot(enum dma_data_direction direction)
				2361	{
				2362	if (direction == DMA_TO_DEVICE)
				2363	return IOMMU_PROT_IR;
				2364	else if (direction == DMA_FROM_DEVICE)
				2365	return IOMMU_PROT_IW;
				2366	else if (direction == DMA_BIDIRECTIONAL)
				2367	return IOMMU_PROT_IW \| IOMMU_PROT_IR;
				2368	else
				2369	return 0;
				2370	}
				2371
				2372	/*
				2373	* This function contains common code for mapping of a physically
				2374	* contiguous memory region into DMA address space. It is used by all
				2375	* mapping functions provided with this IOMMU driver.
				2376	* Must be called with the domain lock held.
				2377	*/
				2378	static dma_addr_t __map_single(struct device *dev,
				2379	struct dma_ops_domain *dma_dom,
				2380	phys_addr_t paddr,
				2381	size_t size,
				2382	enum dma_data_direction direction,
				2383	u64 dma_mask)
				2384	{
				2385	dma_addr_t offset = paddr & ~PAGE_MASK;
				2386	dma_addr_t address, start, ret;
				2387	unsigned int pages;
				2388	int prot = 0;
				2389	int i;
				2390
				2391	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
				2392	paddr &= PAGE_MASK;
				2393
				2394	address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask);
				2395	if (address == AMD_IOMMU_MAPPING_ERROR)
				2396	goto out;
				2397
				2398	prot = dir2prot(direction);
				2399
				2400	start = address;
				2401	for (i = 0; i < pages; ++i) {
				2402	ret = iommu_map_page(&dma_dom->domain, start, paddr,
				2403	PAGE_SIZE, prot, GFP_ATOMIC);
				2404	if (ret)
				2405	goto out_unmap;
				2406
				2407	paddr += PAGE_SIZE;
				2408	start += PAGE_SIZE;
				2409	}
				2410	address += offset;
				2411
				2412	if (unlikely(amd_iommu_np_cache)) {
				2413	domain_flush_pages(&dma_dom->domain, address, size);
				2414	domain_flush_complete(&dma_dom->domain);
				2415	}
				2416
				2417	out:
				2418	return address;
				2419
				2420	out_unmap:
				2421
				2422	for (--i; i >= 0; --i) {
				2423	start -= PAGE_SIZE;
				2424	iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
				2425	}
				2426
				2427	domain_flush_tlb(&dma_dom->domain);
				2428	domain_flush_complete(&dma_dom->domain);
				2429
				2430	dma_ops_free_iova(dma_dom, address, pages);
				2431
				2432	return AMD_IOMMU_MAPPING_ERROR;
				2433	}
				2434
				2435	/*
				2436	* Does the reverse of the __map_single function. Must be called with
				2437	* the domain lock held too
				2438	*/
				2439	static void __unmap_single(struct dma_ops_domain *dma_dom,
				2440	dma_addr_t dma_addr,
				2441	size_t size,
				2442	int dir)
				2443	{
				2444	dma_addr_t flush_addr;
				2445	dma_addr_t i, start;
				2446	unsigned int pages;
				2447
				2448	flush_addr = dma_addr;
				2449	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
				2450	dma_addr &= PAGE_MASK;
				2451	start = dma_addr;
				2452
				2453	for (i = 0; i < pages; ++i) {
				2454	iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
				2455	start += PAGE_SIZE;
				2456	}
				2457
				2458	if (amd_iommu_unmap_flush) {
				2459	domain_flush_tlb(&dma_dom->domain);
				2460	domain_flush_complete(&dma_dom->domain);
				2461	dma_ops_free_iova(dma_dom, dma_addr, pages);
				2462	} else {
				2463	pages = __roundup_pow_of_two(pages);
				2464	queue_iova(&dma_dom->iovad, dma_addr >> PAGE_SHIFT, pages, 0);
				2465	}
				2466	}
				2467
				2468	/*
				2469	* The exported map_single function for dma_ops.
				2470	*/
				2471	static dma_addr_t map_page(struct device dev, struct page page,
				2472	unsigned long offset, size_t size,
				2473	enum dma_data_direction dir,
				2474	unsigned long attrs)
				2475	{
				2476	phys_addr_t paddr = page_to_phys(page) + offset;
				2477	struct protection_domain *domain;
				2478	struct dma_ops_domain *dma_dom;
				2479	u64 dma_mask;
				2480
				2481	domain = get_domain(dev);
				2482	if (PTR_ERR(domain) == -EINVAL)
				2483	return (dma_addr_t)paddr;
				2484	else if (IS_ERR(domain))
				2485	return AMD_IOMMU_MAPPING_ERROR;
				2486
				2487	dma_mask = *dev->dma_mask;
				2488	dma_dom = to_dma_ops_domain(domain);
				2489
				2490	return __map_single(dev, dma_dom, paddr, size, dir, dma_mask);
				2491	}
				2492
				2493	/*
				2494	* The exported unmap_single function for dma_ops.
				2495	*/
				2496	static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
				2497	enum dma_data_direction dir, unsigned long attrs)
				2498	{
				2499	struct protection_domain *domain;
				2500	struct dma_ops_domain *dma_dom;
				2501
				2502	domain = get_domain(dev);
				2503	if (IS_ERR(domain))
				2504	return;
				2505
				2506	dma_dom = to_dma_ops_domain(domain);
				2507
				2508	__unmap_single(dma_dom, dma_addr, size, dir);
				2509	}
				2510
				2511	static int sg_num_pages(struct device *dev,
				2512	struct scatterlist *sglist,
				2513	int nelems)
				2514	{
				2515	unsigned long mask, boundary_size;
				2516	struct scatterlist *s;
				2517	int i, npages = 0;
				2518
				2519	mask = dma_get_seg_boundary(dev);
				2520	boundary_size = mask + 1 ? ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT :
				2521	1UL << (BITS_PER_LONG - PAGE_SHIFT);
				2522
				2523	for_each_sg(sglist, s, nelems, i) {
				2524	int p, n;
				2525
				2526	s->dma_address = npages << PAGE_SHIFT;
				2527	p = npages % boundary_size;
				2528	n = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
				2529	if (p + n > boundary_size)
				2530	npages += boundary_size - p;
				2531	npages += n;
				2532	}
				2533
				2534	return npages;
				2535	}
				2536
				2537	/*
				2538	* The exported map_sg function for dma_ops (handles scatter-gather
				2539	* lists).
				2540	*/
				2541	static int map_sg(struct device dev, struct scatterlist sglist,
				2542	int nelems, enum dma_data_direction direction,
				2543	unsigned long attrs)
				2544	{
				2545	int mapped_pages = 0, npages = 0, prot = 0, i;
				2546	struct protection_domain *domain;
				2547	struct dma_ops_domain *dma_dom;
				2548	struct scatterlist *s;
				2549	unsigned long address;
				2550	u64 dma_mask;
				2551
				2552	domain = get_domain(dev);
				2553	if (IS_ERR(domain))
				2554	return 0;
				2555
				2556	dma_dom = to_dma_ops_domain(domain);
				2557	dma_mask = *dev->dma_mask;
				2558
				2559	npages = sg_num_pages(dev, sglist, nelems);
				2560
				2561	address = dma_ops_alloc_iova(dev, dma_dom, npages, dma_mask);
				2562	if (address == AMD_IOMMU_MAPPING_ERROR)
				2563	goto out_err;
				2564
				2565	prot = dir2prot(direction);
				2566
				2567	/* Map all sg entries */
				2568	for_each_sg(sglist, s, nelems, i) {
				2569	int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
				2570
				2571	for (j = 0; j < pages; ++j) {
				2572	unsigned long bus_addr, phys_addr;
				2573	int ret;
				2574
				2575	bus_addr = address + s->dma_address + (j << PAGE_SHIFT);
				2576	phys_addr = (sg_phys(s) & PAGE_MASK) + (j << PAGE_SHIFT);
				2577	ret = iommu_map_page(domain, bus_addr, phys_addr,
				2578	PAGE_SIZE, prot,
				2579	GFP_ATOMIC \| __GFP_NOWARN);
				2580	if (ret)
				2581	goto out_unmap;
				2582
				2583	mapped_pages += 1;
				2584	}
				2585	}
				2586
				2587	/* Everything is mapped - write the right values into s->dma_address */
				2588	for_each_sg(sglist, s, nelems, i) {
				2589	/*
				2590	* Add in the remaining piece of the scatter-gather offset that
				2591	* was masked out when we were determining the physical address
				2592	* via (sg_phys(s) & PAGE_MASK) earlier.
				2593	*/
				2594	s->dma_address += address + (s->offset & ~PAGE_MASK);
				2595	s->dma_length = s->length;
				2596	}
				2597
				2598	return nelems;
				2599
				2600	out_unmap:
				2601	pr_err("%s: IOMMU mapping error in map_sg (io-pages: %d)\n",
				2602	dev_name(dev), npages);
				2603
				2604	for_each_sg(sglist, s, nelems, i) {
				2605	int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
				2606
				2607	for (j = 0; j < pages; ++j) {
				2608	unsigned long bus_addr;
				2609
				2610	bus_addr = address + s->dma_address + (j << PAGE_SHIFT);
				2611	iommu_unmap_page(domain, bus_addr, PAGE_SIZE);
				2612
				2613	if (--mapped_pages == 0)
				2614	goto out_free_iova;
				2615	}
				2616	}
				2617
				2618	out_free_iova:
				2619	free_iova_fast(&dma_dom->iovad, address >> PAGE_SHIFT, npages);
				2620
				2621	out_err:
				2622	return 0;
				2623	}
				2624
				2625	/*
				2626	* The exported map_sg function for dma_ops (handles scatter-gather
				2627	* lists).
				2628	*/
				2629	static void unmap_sg(struct device dev, struct scatterlist sglist,
				2630	int nelems, enum dma_data_direction dir,
				2631	unsigned long attrs)
				2632	{
				2633	struct protection_domain *domain;
				2634	struct dma_ops_domain *dma_dom;
				2635	unsigned long startaddr;
				2636	int npages = 2;
				2637
				2638	domain = get_domain(dev);
				2639	if (IS_ERR(domain))
				2640	return;
				2641
				2642	startaddr = sg_dma_address(sglist) & PAGE_MASK;
				2643	dma_dom = to_dma_ops_domain(domain);
				2644	npages = sg_num_pages(dev, sglist, nelems);
				2645
				2646	__unmap_single(dma_dom, startaddr, npages << PAGE_SHIFT, dir);
				2647	}
				2648
				2649	/*
				2650	* The exported alloc_coherent function for dma_ops.
				2651	*/
				2652	static void alloc_coherent(struct device dev, size_t size,
				2653	dma_addr_t *dma_addr, gfp_t flag,
				2654	unsigned long attrs)
				2655	{
				2656	u64 dma_mask = dev->coherent_dma_mask;
				2657	struct protection_domain *domain;
				2658	struct dma_ops_domain *dma_dom;
				2659	struct page *page;
				2660
				2661	domain = get_domain(dev);
				2662	if (PTR_ERR(domain) == -EINVAL) {
				2663	page = alloc_pages(flag, get_order(size));
				2664	*dma_addr = page_to_phys(page);
				2665	return page_address(page);
				2666	} else if (IS_ERR(domain))
				2667	return NULL;
				2668
				2669	dma_dom = to_dma_ops_domain(domain);
				2670	size = PAGE_ALIGN(size);
				2671	dma_mask = dev->coherent_dma_mask;
				2672	flag &= ~(__GFP_DMA \| __GFP_HIGHMEM \| __GFP_DMA32);
				2673	flag \|= __GFP_ZERO;
				2674
				2675	page = alloc_pages(flag \| __GFP_NOWARN, get_order(size));
				2676	if (!page) {
				2677	if (!gfpflags_allow_blocking(flag))
				2678	return NULL;
				2679
				2680	page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
				2681	get_order(size), flag);
				2682	if (!page)
				2683	return NULL;
				2684	}
				2685
				2686	if (!dma_mask)
				2687	dma_mask = *dev->dma_mask;
				2688
				2689	*dma_addr = __map_single(dev, dma_dom, page_to_phys(page),
				2690	size, DMA_BIDIRECTIONAL, dma_mask);
				2691
				2692	if (*dma_addr == AMD_IOMMU_MAPPING_ERROR)
				2693	goto out_free;
				2694
				2695	return page_address(page);
				2696
				2697	out_free:
				2698
				2699	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
				2700	__free_pages(page, get_order(size));
				2701
				2702	return NULL;
				2703	}
				2704
				2705	/*
				2706	* The exported free_coherent function for dma_ops.
				2707	*/
				2708	static void free_coherent(struct device *dev, size_t size,
				2709	void *virt_addr, dma_addr_t dma_addr,
				2710	unsigned long attrs)
				2711	{
				2712	struct protection_domain *domain;
				2713	struct dma_ops_domain *dma_dom;
				2714	struct page *page;
				2715
				2716	page = virt_to_page(virt_addr);
				2717	size = PAGE_ALIGN(size);
				2718
				2719	domain = get_domain(dev);
				2720	if (IS_ERR(domain))
				2721	goto free_mem;
				2722
				2723	dma_dom = to_dma_ops_domain(domain);
				2724
				2725	__unmap_single(dma_dom, dma_addr, size, DMA_BIDIRECTIONAL);
				2726
				2727	free_mem:
				2728	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
				2729	__free_pages(page, get_order(size));
				2730	}
				2731
				2732	/*
				2733	* This function is called by the DMA layer to find out if we can handle a
				2734	* particular device. It is part of the dma_ops.
				2735	*/
				2736	static int amd_iommu_dma_supported(struct device *dev, u64 mask)
				2737	{
				2738	if (!x86_dma_supported(dev, mask))
				2739	return 0;
				2740	return check_device(dev);
				2741	}
				2742
				2743	static int amd_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
				2744	{
				2745	return dma_addr == AMD_IOMMU_MAPPING_ERROR;
				2746	}
				2747
				2748	static const struct dma_map_ops amd_iommu_dma_ops = {
				2749	.alloc = alloc_coherent,
				2750	.free = free_coherent,
				2751	.map_page = map_page,
				2752	.unmap_page = unmap_page,
				2753	.map_sg = map_sg,
				2754	.unmap_sg = unmap_sg,
				2755	.dma_supported = amd_iommu_dma_supported,
				2756	.mapping_error = amd_iommu_mapping_error,
				2757	};
				2758
				2759	static int init_reserved_iova_ranges(void)
				2760	{
				2761	struct pci_dev *pdev = NULL;
				2762	struct iova *val;
				2763
				2764	init_iova_domain(&reserved_iova_ranges, PAGE_SIZE,
				2765	IOVA_START_PFN, DMA_32BIT_PFN);
				2766
				2767	lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock,
				2768	&reserved_rbtree_key);
				2769
				2770	/* MSI memory range */
				2771	val = reserve_iova(&reserved_iova_ranges,
				2772	IOVA_PFN(MSI_RANGE_START), IOVA_PFN(MSI_RANGE_END));
				2773	if (!val) {
				2774	pr_err("Reserving MSI range failed\n");
				2775	return -ENOMEM;
				2776	}
				2777
				2778	/* HT memory range */
				2779	val = reserve_iova(&reserved_iova_ranges,
				2780	IOVA_PFN(HT_RANGE_START), IOVA_PFN(HT_RANGE_END));
				2781	if (!val) {
				2782	pr_err("Reserving HT range failed\n");
				2783	return -ENOMEM;
				2784	}
				2785
				2786	/*
				2787	* Memory used for PCI resources
				2788	* FIXME: Check whether we can reserve the PCI-hole completly
				2789	*/
				2790	for_each_pci_dev(pdev) {
				2791	int i;
				2792
				2793	for (i = 0; i < PCI_NUM_RESOURCES; ++i) {
				2794	struct resource *r = &pdev->resource[i];
				2795
				2796	if (!(r->flags & IORESOURCE_MEM))
				2797	continue;
				2798
				2799	val = reserve_iova(&reserved_iova_ranges,
				2800	IOVA_PFN(r->start),
				2801	IOVA_PFN(r->end));
				2802	if (!val) {
				2803	pr_err("Reserve pci-resource range failed\n");
				2804	return -ENOMEM;
				2805	}
				2806	}
				2807	}
				2808
				2809	return 0;
				2810	}
				2811
				2812	int __init amd_iommu_init_api(void)
				2813	{
				2814	int ret, err = 0;
				2815
				2816	ret = iova_cache_get();
				2817	if (ret)
				2818	return ret;
				2819
				2820	ret = init_reserved_iova_ranges();
				2821	if (ret)
				2822	return ret;
				2823
				2824	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
				2825	if (err)
				2826	return err;
				2827	#ifdef CONFIG_ARM_AMBA
				2828	err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
				2829	if (err)
				2830	return err;
				2831	#endif
				2832	err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
				2833	if (err)
				2834	return err;
				2835
				2836	return 0;
				2837	}
				2838
				2839	int __init amd_iommu_init_dma_ops(void)
				2840	{
				2841	swiotlb = (iommu_pass_through \|\| sme_me_mask) ? 1 : 0;
				2842	iommu_detected = 1;
				2843
				2844	/*
				2845	* In case we don't initialize SWIOTLB (actually the common case
				2846	* when AMD IOMMU is enabled and SME is not active), make sure there
				2847	* are global dma_ops set as a fall-back for devices not handled by
				2848	* this driver (for example non-PCI devices). When SME is active,
				2849	* make sure that swiotlb variable remains set so the global dma_ops
				2850	* continue to be SWIOTLB.
				2851	*/
				2852	if (!swiotlb)
				2853	dma_ops = &nommu_dma_ops;
				2854
				2855	if (amd_iommu_unmap_flush)
				2856	pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
				2857	else
				2858	pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");
				2859
				2860	return 0;
				2861
				2862	}
				2863
				2864	/*****************************************************************************
				2865	*
				2866	* The following functions belong to the exported interface of AMD IOMMU
				2867	*
				2868	* This interface allows access to lower level functions of the IOMMU
				2869	* like protection domain handling and assignment of devices to domains
				2870	* which is not possible with the dma_ops interface.
				2871	*
				2872	*****************************************************************************/
				2873
				2874	static void cleanup_domain(struct protection_domain *domain)
				2875	{
				2876	struct iommu_dev_data *entry;
				2877	unsigned long flags;
				2878
				2879	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				2880
				2881	while (!list_empty(&domain->dev_list)) {
				2882	entry = list_first_entry(&domain->dev_list,
				2883	struct iommu_dev_data, list);
				2884	__detach_device(entry);
				2885	}
				2886
				2887	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				2888	}
				2889
				2890	static void protection_domain_free(struct protection_domain *domain)
				2891	{
				2892	if (!domain)
				2893	return;
				2894
				2895	del_domain_from_list(domain);
				2896
				2897	if (domain->id)
				2898	domain_id_free(domain->id);
				2899
				2900	kfree(domain);
				2901	}
				2902
				2903	static int protection_domain_init(struct protection_domain *domain)
				2904	{
				2905	spin_lock_init(&domain->lock);
				2906	mutex_init(&domain->api_lock);
				2907	domain->id = domain_id_alloc();
				2908	if (!domain->id)
				2909	return -ENOMEM;
				2910	INIT_LIST_HEAD(&domain->dev_list);
				2911
				2912	return 0;
				2913	}
				2914
				2915	static struct protection_domain *protection_domain_alloc(void)
				2916	{
				2917	struct protection_domain *domain;
				2918
				2919	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
				2920	if (!domain)
				2921	return NULL;
				2922
				2923	if (protection_domain_init(domain))
				2924	goto out_err;
				2925
				2926	add_domain_to_list(domain);
				2927
				2928	return domain;
				2929
				2930	out_err:
				2931	kfree(domain);
				2932
				2933	return NULL;
				2934	}
				2935
				2936	static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
				2937	{
				2938	struct protection_domain *pdomain;
				2939	struct dma_ops_domain *dma_domain;
				2940
				2941	switch (type) {
				2942	case IOMMU_DOMAIN_UNMANAGED:
				2943	pdomain = protection_domain_alloc();
				2944	if (!pdomain)
				2945	return NULL;
				2946
				2947	pdomain->mode = PAGE_MODE_3_LEVEL;
				2948	pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
				2949	if (!pdomain->pt_root) {
				2950	protection_domain_free(pdomain);
				2951	return NULL;
				2952	}
				2953
				2954	pdomain->domain.geometry.aperture_start = 0;
				2955	pdomain->domain.geometry.aperture_end = ~0ULL;
				2956	pdomain->domain.geometry.force_aperture = true;
				2957
				2958	break;
				2959	case IOMMU_DOMAIN_DMA:
				2960	dma_domain = dma_ops_domain_alloc();
				2961	if (!dma_domain) {
				2962	pr_err("AMD-Vi: Failed to allocate\n");
				2963	return NULL;
				2964	}
				2965	pdomain = &dma_domain->domain;
				2966	break;
				2967	case IOMMU_DOMAIN_IDENTITY:
				2968	pdomain = protection_domain_alloc();
				2969	if (!pdomain)
				2970	return NULL;
				2971
				2972	pdomain->mode = PAGE_MODE_NONE;
				2973	break;
				2974	default:
				2975	return NULL;
				2976	}
				2977
				2978	return &pdomain->domain;
				2979	}
				2980
				2981	static void amd_iommu_domain_free(struct iommu_domain *dom)
				2982	{
				2983	struct protection_domain *domain;
				2984	struct dma_ops_domain *dma_dom;
				2985
				2986	domain = to_pdomain(dom);
				2987
				2988	if (domain->dev_cnt > 0)
				2989	cleanup_domain(domain);
				2990
				2991	BUG_ON(domain->dev_cnt != 0);
				2992
				2993	if (!dom)
				2994	return;
				2995
				2996	switch (dom->type) {
				2997	case IOMMU_DOMAIN_DMA:
				2998	/* Now release the domain */
				2999	dma_dom = to_dma_ops_domain(domain);
				3000	dma_ops_domain_free(dma_dom);
				3001	break;
				3002	default:
				3003	if (domain->mode != PAGE_MODE_NONE)
				3004	free_pagetable(domain);
				3005
				3006	if (domain->flags & PD_IOMMUV2_MASK)
				3007	free_gcr3_table(domain);
				3008
				3009	protection_domain_free(domain);
				3010	break;
				3011	}
				3012	}
				3013
				3014	static void amd_iommu_detach_device(struct iommu_domain *dom,
				3015	struct device *dev)
				3016	{
				3017	struct iommu_dev_data *dev_data = dev->archdata.iommu;
				3018	struct amd_iommu *iommu;
				3019	int devid;
				3020
				3021	if (!check_device(dev))
				3022	return;
				3023
				3024	devid = get_device_id(dev);
				3025	if (devid < 0)
				3026	return;
				3027
				3028	if (dev_data->domain != NULL)
				3029	detach_device(dev);
				3030
				3031	iommu = amd_iommu_rlookup_table[devid];
				3032	if (!iommu)
				3033	return;
				3034
				3035	#ifdef CONFIG_IRQ_REMAP
				3036	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
				3037	(dom->type == IOMMU_DOMAIN_UNMANAGED))
				3038	dev_data->use_vapic = 0;
				3039	#endif
				3040
				3041	iommu_completion_wait(iommu);
				3042	}
				3043
				3044	static int amd_iommu_attach_device(struct iommu_domain *dom,
				3045	struct device *dev)
				3046	{
				3047	struct protection_domain *domain = to_pdomain(dom);
				3048	struct iommu_dev_data *dev_data;
				3049	struct amd_iommu *iommu;
				3050	int ret;
				3051
				3052	if (!check_device(dev))
				3053	return -EINVAL;
				3054
				3055	dev_data = dev->archdata.iommu;
				3056
				3057	iommu = amd_iommu_rlookup_table[dev_data->devid];
				3058	if (!iommu)
				3059	return -EINVAL;
				3060
				3061	if (dev_data->domain)
				3062	detach_device(dev);
				3063
				3064	ret = attach_device(dev, domain);
				3065
				3066	#ifdef CONFIG_IRQ_REMAP
				3067	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
				3068	if (dom->type == IOMMU_DOMAIN_UNMANAGED)
				3069	dev_data->use_vapic = 1;
				3070	else
				3071	dev_data->use_vapic = 0;
				3072	}
				3073	#endif
				3074
				3075	iommu_completion_wait(iommu);
				3076
				3077	return ret;
				3078	}
				3079
				3080	static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
				3081	phys_addr_t paddr, size_t page_size, int iommu_prot)
				3082	{
				3083	struct protection_domain *domain = to_pdomain(dom);
				3084	int prot = 0;
				3085	int ret;
				3086
				3087	if (domain->mode == PAGE_MODE_NONE)
				3088	return -EINVAL;
				3089
				3090	if (iommu_prot & IOMMU_READ)
				3091	prot \|= IOMMU_PROT_IR;
				3092	if (iommu_prot & IOMMU_WRITE)
				3093	prot \|= IOMMU_PROT_IW;
				3094
				3095	mutex_lock(&domain->api_lock);
				3096	ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL);
				3097	mutex_unlock(&domain->api_lock);
				3098
				3099	return ret;
				3100	}
				3101
				3102	static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
				3103	size_t page_size)
				3104	{
				3105	struct protection_domain *domain = to_pdomain(dom);
				3106	size_t unmap_size;
				3107
				3108	if (domain->mode == PAGE_MODE_NONE)
				3109	return -EINVAL;
				3110
				3111	mutex_lock(&domain->api_lock);
				3112	unmap_size = iommu_unmap_page(domain, iova, page_size);
				3113	mutex_unlock(&domain->api_lock);
				3114
				3115	domain_flush_tlb_pde(domain);
				3116	domain_flush_complete(domain);
				3117
				3118	return unmap_size;
				3119	}
				3120
				3121	static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
				3122	dma_addr_t iova)
				3123	{
				3124	struct protection_domain *domain = to_pdomain(dom);
				3125	unsigned long offset_mask, pte_pgsize;
				3126	u64 *pte, __pte;
				3127
				3128	if (domain->mode == PAGE_MODE_NONE)
				3129	return iova;
				3130
				3131	pte = fetch_pte(domain, iova, &pte_pgsize);
				3132
				3133	if (!pte \|\| !IOMMU_PTE_PRESENT(*pte))
				3134	return 0;
				3135
				3136	offset_mask = pte_pgsize - 1;
				3137	__pte = __sme_clr(*pte & PM_ADDR_MASK);
				3138
				3139	return (__pte & ~offset_mask) \| (iova & offset_mask);
				3140	}
				3141
				3142	static bool amd_iommu_capable(enum iommu_cap cap)
				3143	{
				3144	switch (cap) {
				3145	case IOMMU_CAP_CACHE_COHERENCY:
				3146	return true;
				3147	case IOMMU_CAP_INTR_REMAP:
				3148	return (irq_remapping_enabled == 1);
				3149	case IOMMU_CAP_NOEXEC:
				3150	return false;
				3151	}
				3152
				3153	return false;
				3154	}
				3155
				3156	static void amd_iommu_get_resv_regions(struct device *dev,
				3157	struct list_head *head)
				3158	{
				3159	struct iommu_resv_region *region;
				3160	struct unity_map_entry *entry;
				3161	int devid;
				3162
				3163	devid = get_device_id(dev);
				3164	if (devid < 0)
				3165	return;
				3166
				3167	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
				3168	int type, prot = 0;
				3169	size_t length;
				3170
				3171	if (devid < entry->devid_start \|\| devid > entry->devid_end)
				3172	continue;
				3173
				3174	type = IOMMU_RESV_DIRECT;
				3175	length = entry->address_end - entry->address_start;
				3176	if (entry->prot & IOMMU_PROT_IR)
				3177	prot \|= IOMMU_READ;
				3178	if (entry->prot & IOMMU_PROT_IW)
				3179	prot \|= IOMMU_WRITE;
				3180	if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
				3181	/* Exclusion range */
				3182	type = IOMMU_RESV_RESERVED;
				3183
				3184	region = iommu_alloc_resv_region(entry->address_start,
				3185	length, prot, type);
				3186	if (!region) {
				3187	pr_err("Out of memory allocating dm-regions for %s\n",
				3188	dev_name(dev));
				3189	return;
				3190	}
				3191	list_add_tail(&region->list, head);
				3192	}
				3193
				3194	region = iommu_alloc_resv_region(MSI_RANGE_START,
				3195	MSI_RANGE_END - MSI_RANGE_START + 1,
				3196	0, IOMMU_RESV_MSI);
				3197	if (!region)
				3198	return;
				3199	list_add_tail(&region->list, head);
				3200
				3201	region = iommu_alloc_resv_region(HT_RANGE_START,
				3202	HT_RANGE_END - HT_RANGE_START + 1,
				3203	0, IOMMU_RESV_RESERVED);
				3204	if (!region)
				3205	return;
				3206	list_add_tail(&region->list, head);
				3207	}
				3208
				3209	static void amd_iommu_put_resv_regions(struct device *dev,
				3210	struct list_head *head)
				3211	{
				3212	struct iommu_resv_region entry, next;
				3213
				3214	list_for_each_entry_safe(entry, next, head, list)
				3215	kfree(entry);
				3216	}
				3217
				3218	static void amd_iommu_apply_resv_region(struct device *dev,
				3219	struct iommu_domain *domain,
				3220	struct iommu_resv_region *region)
				3221	{
				3222	struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain));
				3223	unsigned long start, end;
				3224
				3225	start = IOVA_PFN(region->start);
				3226	end = IOVA_PFN(region->start + region->length - 1);
				3227
				3228	WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL);
				3229	}
				3230
				3231	static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
				3232	struct device *dev)
				3233	{
				3234	struct iommu_dev_data *dev_data = dev->archdata.iommu;
				3235	return dev_data->defer_attach;
				3236	}
				3237
				3238	const struct iommu_ops amd_iommu_ops = {
				3239	.capable = amd_iommu_capable,
				3240	.domain_alloc = amd_iommu_domain_alloc,
				3241	.domain_free = amd_iommu_domain_free,
				3242	.attach_dev = amd_iommu_attach_device,
				3243	.detach_dev = amd_iommu_detach_device,
				3244	.map = amd_iommu_map,
				3245	.unmap = amd_iommu_unmap,
				3246	.map_sg = default_iommu_map_sg,
				3247	.iova_to_phys = amd_iommu_iova_to_phys,
				3248	.add_device = amd_iommu_add_device,
				3249	.remove_device = amd_iommu_remove_device,
				3250	.device_group = amd_iommu_device_group,
				3251	.get_resv_regions = amd_iommu_get_resv_regions,
				3252	.put_resv_regions = amd_iommu_put_resv_regions,
				3253	.apply_resv_region = amd_iommu_apply_resv_region,
				3254	.is_attach_deferred = amd_iommu_is_attach_deferred,
				3255	.pgsize_bitmap = AMD_IOMMU_PGSIZES,
				3256	};
				3257
				3258	/*****************************************************************************
				3259	*
				3260	* The next functions do a basic initialization of IOMMU for pass through
				3261	* mode
				3262	*
				3263	* In passthrough mode the IOMMU is initialized and enabled but not used for
				3264	* DMA-API translation.
				3265	*
				3266	*****************************************************************************/
				3267
				3268	/* IOMMUv2 specific functions */
				3269	int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
				3270	{
				3271	return atomic_notifier_chain_register(&ppr_notifier, nb);
				3272	}
				3273	EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
				3274
				3275	int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
				3276	{
				3277	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
				3278	}
				3279	EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
				3280
				3281	void amd_iommu_domain_direct_map(struct iommu_domain *dom)
				3282	{
				3283	struct protection_domain *domain = to_pdomain(dom);
				3284	unsigned long flags;
				3285
				3286	spin_lock_irqsave(&domain->lock, flags);
				3287
				3288	/* Update data structure */
				3289	domain->mode = PAGE_MODE_NONE;
				3290	domain->updated = true;
				3291
				3292	/* Make changes visible to IOMMUs */
				3293	update_domain(domain);
				3294
				3295	/* Page-table is not visible to IOMMU anymore, so free it */
				3296	free_pagetable(domain);
				3297
				3298	spin_unlock_irqrestore(&domain->lock, flags);
				3299	}
				3300	EXPORT_SYMBOL(amd_iommu_domain_direct_map);
				3301
				3302	int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
				3303	{
				3304	struct protection_domain *domain = to_pdomain(dom);
				3305	unsigned long flags;
				3306	int levels, ret;
				3307
				3308	if (pasids <= 0 \|\| pasids > (PASID_MASK + 1))
				3309	return -EINVAL;
				3310
				3311	/* Number of GCR3 table levels required */
				3312	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
				3313	levels += 1;
				3314
				3315	if (levels > amd_iommu_max_glx_val)
				3316	return -EINVAL;
				3317
				3318	spin_lock_irqsave(&domain->lock, flags);
				3319
				3320	/*
				3321	* Save us all sanity checks whether devices already in the
				3322	* domain support IOMMUv2. Just force that the domain has no
				3323	* devices attached when it is switched into IOMMUv2 mode.
				3324	*/
				3325	ret = -EBUSY;
				3326	if (domain->dev_cnt > 0 \|\| domain->flags & PD_IOMMUV2_MASK)
				3327	goto out;
				3328
				3329	ret = -ENOMEM;
				3330	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
				3331	if (domain->gcr3_tbl == NULL)
				3332	goto out;
				3333
				3334	domain->glx = levels;
				3335	domain->flags \|= PD_IOMMUV2_MASK;
				3336	domain->updated = true;
				3337
				3338	update_domain(domain);
				3339
				3340	ret = 0;
				3341
				3342	out:
				3343	spin_unlock_irqrestore(&domain->lock, flags);
				3344
				3345	return ret;
				3346	}
				3347	EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
				3348
				3349	static int __flush_pasid(struct protection_domain *domain, int pasid,
				3350	u64 address, bool size)
				3351	{
				3352	struct iommu_dev_data *dev_data;
				3353	struct iommu_cmd cmd;
				3354	int i, ret;
				3355
				3356	if (!(domain->flags & PD_IOMMUV2_MASK))
				3357	return -EINVAL;
				3358
				3359	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
				3360
				3361	/*
				3362	* IOMMU TLB needs to be flushed before Device TLB to
				3363	* prevent device TLB refill from IOMMU TLB
				3364	*/
				3365	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
				3366	if (domain->dev_iommu[i] == 0)
				3367	continue;
				3368
				3369	ret = iommu_queue_command(amd_iommus[i], &cmd);
				3370	if (ret != 0)
				3371	goto out;
				3372	}
				3373
				3374	/* Wait until IOMMU TLB flushes are complete */
				3375	domain_flush_complete(domain);
				3376
				3377	/* Now flush device TLBs */
				3378	list_for_each_entry(dev_data, &domain->dev_list, list) {
				3379	struct amd_iommu *iommu;
				3380	int qdep;
				3381
				3382	/*
				3383	There might be non-IOMMUv2 capable devices in an IOMMUv2
				3384	* domain.
				3385	*/
				3386	if (!dev_data->ats.enabled)
				3387	continue;
				3388
				3389	qdep = dev_data->ats.qdep;
				3390	iommu = amd_iommu_rlookup_table[dev_data->devid];
				3391
				3392	build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
				3393	qdep, address, size);
				3394
				3395	ret = iommu_queue_command(iommu, &cmd);
				3396	if (ret != 0)
				3397	goto out;
				3398	}
				3399
				3400	/* Wait until all device TLBs are flushed */
				3401	domain_flush_complete(domain);
				3402
				3403	ret = 0;
				3404
				3405	out:
				3406
				3407	return ret;
				3408	}
				3409
				3410	static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
				3411	u64 address)
				3412	{
				3413	return __flush_pasid(domain, pasid, address, false);
				3414	}
				3415
				3416	int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
				3417	u64 address)
				3418	{
				3419	struct protection_domain *domain = to_pdomain(dom);
				3420	unsigned long flags;
				3421	int ret;
				3422
				3423	spin_lock_irqsave(&domain->lock, flags);
				3424	ret = __amd_iommu_flush_page(domain, pasid, address);
				3425	spin_unlock_irqrestore(&domain->lock, flags);
				3426
				3427	return ret;
				3428	}
				3429	EXPORT_SYMBOL(amd_iommu_flush_page);
				3430
				3431	static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
				3432	{
				3433	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				3434	true);
				3435	}
				3436
				3437	int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
				3438	{
				3439	struct protection_domain *domain = to_pdomain(dom);
				3440	unsigned long flags;
				3441	int ret;
				3442
				3443	spin_lock_irqsave(&domain->lock, flags);
				3444	ret = __amd_iommu_flush_tlb(domain, pasid);
				3445	spin_unlock_irqrestore(&domain->lock, flags);
				3446
				3447	return ret;
				3448	}
				3449	EXPORT_SYMBOL(amd_iommu_flush_tlb);
				3450
				3451	static u64 __get_gcr3_pte(u64 root, int level, int pasid, bool alloc)
				3452	{
				3453	int index;
				3454	u64 *pte;
				3455
				3456	while (true) {
				3457
				3458	index = (pasid >> (9 * level)) & 0x1ff;
				3459	pte = &root[index];
				3460
				3461	if (level == 0)
				3462	break;
				3463
				3464	if (!(*pte & GCR3_VALID)) {
				3465	if (!alloc)
				3466	return NULL;
				3467
				3468	root = (void *)get_zeroed_page(GFP_ATOMIC);
				3469	if (root == NULL)
				3470	return NULL;
				3471
				3472	*pte = iommu_virt_to_phys(root) \| GCR3_VALID;
				3473	}
				3474
				3475	root = iommu_phys_to_virt(*pte & PAGE_MASK);
				3476
				3477	level -= 1;
				3478	}
				3479
				3480	return pte;
				3481	}
				3482
				3483	static int __set_gcr3(struct protection_domain *domain, int pasid,
				3484	unsigned long cr3)
				3485	{
				3486	u64 *pte;
				3487
				3488	if (domain->mode != PAGE_MODE_NONE)
				3489	return -EINVAL;
				3490
				3491	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
				3492	if (pte == NULL)
				3493	return -ENOMEM;
				3494
				3495	*pte = (cr3 & PAGE_MASK) \| GCR3_VALID;
				3496
				3497	return __amd_iommu_flush_tlb(domain, pasid);
				3498	}
				3499
				3500	static int __clear_gcr3(struct protection_domain *domain, int pasid)
				3501	{
				3502	u64 *pte;
				3503
				3504	if (domain->mode != PAGE_MODE_NONE)
				3505	return -EINVAL;
				3506
				3507	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
				3508	if (pte == NULL)
				3509	return 0;
				3510
				3511	*pte = 0;
				3512
				3513	return __amd_iommu_flush_tlb(domain, pasid);
				3514	}
				3515
				3516	int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
				3517	unsigned long cr3)
				3518	{
				3519	struct protection_domain *domain = to_pdomain(dom);
				3520	unsigned long flags;
				3521	int ret;
				3522
				3523	spin_lock_irqsave(&domain->lock, flags);
				3524	ret = __set_gcr3(domain, pasid, cr3);
				3525	spin_unlock_irqrestore(&domain->lock, flags);
				3526
				3527	return ret;
				3528	}
				3529	EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
				3530
				3531	int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
				3532	{
				3533	struct protection_domain *domain = to_pdomain(dom);
				3534	unsigned long flags;
				3535	int ret;
				3536
				3537	spin_lock_irqsave(&domain->lock, flags);
				3538	ret = __clear_gcr3(domain, pasid);
				3539	spin_unlock_irqrestore(&domain->lock, flags);
				3540
				3541	return ret;
				3542	}
				3543	EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
				3544
				3545	int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
				3546	int status, int tag)
				3547	{
				3548	struct iommu_dev_data *dev_data;
				3549	struct amd_iommu *iommu;
				3550	struct iommu_cmd cmd;
				3551
				3552	dev_data = get_dev_data(&pdev->dev);
				3553	iommu = amd_iommu_rlookup_table[dev_data->devid];
				3554
				3555	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
				3556	tag, dev_data->pri_tlp);
				3557
				3558	return iommu_queue_command(iommu, &cmd);
				3559	}
				3560	EXPORT_SYMBOL(amd_iommu_complete_ppr);
				3561
				3562	struct iommu_domain amd_iommu_get_v2_domain(struct pci_dev pdev)
				3563	{
				3564	struct protection_domain *pdomain;
				3565
				3566	pdomain = get_domain(&pdev->dev);
				3567	if (IS_ERR(pdomain))
				3568	return NULL;
				3569
				3570	/* Only return IOMMUv2 domains */
				3571	if (!(pdomain->flags & PD_IOMMUV2_MASK))
				3572	return NULL;
				3573
				3574	return &pdomain->domain;
				3575	}
				3576	EXPORT_SYMBOL(amd_iommu_get_v2_domain);
				3577
				3578	void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
				3579	{
				3580	struct iommu_dev_data *dev_data;
				3581
				3582	if (!amd_iommu_v2_supported())
				3583	return;
				3584
				3585	dev_data = get_dev_data(&pdev->dev);
				3586	dev_data->errata \|= (1 << erratum);
				3587	}
				3588	EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
				3589
				3590	int amd_iommu_device_info(struct pci_dev *pdev,
				3591	struct amd_iommu_device_info *info)
				3592	{
				3593	int max_pasids;
				3594	int pos;
				3595
				3596	if (pdev == NULL \|\| info == NULL)
				3597	return -EINVAL;
				3598
				3599	if (!amd_iommu_v2_supported())
				3600	return -EINVAL;
				3601
				3602	memset(info, 0, sizeof(*info));
				3603
				3604	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
				3605	if (pos)
				3606	info->flags \|= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
				3607
				3608	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
				3609	if (pos)
				3610	info->flags \|= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
				3611
				3612	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
				3613	if (pos) {
				3614	int features;
				3615
				3616	max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
				3617	max_pasids = min(max_pasids, (1 << 20));
				3618
				3619	info->flags \|= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
				3620	info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
				3621
				3622	features = pci_pasid_features(pdev);
				3623	if (features & PCI_PASID_CAP_EXEC)
				3624	info->flags \|= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
				3625	if (features & PCI_PASID_CAP_PRIV)
				3626	info->flags \|= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
				3627	}
				3628
				3629	return 0;
				3630	}
				3631	EXPORT_SYMBOL(amd_iommu_device_info);
				3632
				3633	#ifdef CONFIG_IRQ_REMAP
				3634
				3635	/*****************************************************************************
				3636	*
				3637	* Interrupt Remapping Implementation
				3638	*
				3639	*****************************************************************************/
				3640
				/*
				 * Declared here without an initializer so that irq_remapping_alloc()
				 * can take its address; presumably the full definition follows later
				 * in this file - TODO confirm.
				 */
				3641	static struct irq_chip amd_ir_chip;
				3642
				3643	static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
				3644	{
				3645	u64 dte;
				3646
				3647	dte = amd_iommu_dev_table[devid].data[2];
				3648	dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
				3649	dte \|= iommu_virt_to_phys(table->table);
				3650	dte \|= DTE_IRQ_REMAP_INTCTL;
				3651	dte \|= DTE_IRQ_TABLE_LEN;
				3652	dte \|= DTE_IRQ_REMAP_ENABLE;
				3653
				3654	amd_iommu_dev_table[devid].data[2] = dte;
				3655	}
				3656
				3657	static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
				3658	{
				3659	struct irq_remap_table *table = NULL;
				3660	struct amd_iommu *iommu;
				3661	unsigned long flags;
				3662	u16 alias;
				3663
				3664	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
				3665
				3666	iommu = amd_iommu_rlookup_table[devid];
				3667	if (!iommu)
				3668	goto out_unlock;
				3669
				3670	table = irq_lookup_table[devid];
				3671	if (table)
				3672	goto out_unlock;
				3673
				3674	alias = amd_iommu_alias_table[devid];
				3675	table = irq_lookup_table[alias];
				3676	if (table) {
				3677	irq_lookup_table[devid] = table;
				3678	set_dte_irq_entry(devid, table);
				3679	iommu_flush_dte(iommu, devid);
				3680	goto out;
				3681	}
				3682
				3683	/* Nothing there yet, allocate new irq remapping table */
				3684	table = kzalloc(sizeof(*table), GFP_ATOMIC);
				3685	if (!table)
				3686	goto out_unlock;
				3687
				3688	/* Initialize table spin-lock */
				3689	spin_lock_init(&table->lock);
				3690
				3691	if (ioapic)
				3692	/* Keep the first 32 indexes free for IOAPIC interrupts */
				3693	table->min_index = 32;
				3694
				3695	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
				3696	if (!table->table) {
				3697	kfree(table);
				3698	table = NULL;
				3699	goto out_unlock;
				3700	}
				3701
				3702	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
				3703	memset(table->table, 0,
				3704	MAX_IRQS_PER_TABLE * sizeof(u32));
				3705	else
				3706	memset(table->table, 0,
				3707	(MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
				3708
				3709	if (ioapic) {
				3710	int i;
				3711
				3712	for (i = 0; i < 32; ++i)
				3713	iommu->irte_ops->set_allocated(table, i);
				3714	}
				3715
				3716	irq_lookup_table[devid] = table;
				3717	set_dte_irq_entry(devid, table);
				3718	iommu_flush_dte(iommu, devid);
				3719	if (devid != alias) {
				3720	irq_lookup_table[alias] = table;
				3721	set_dte_irq_entry(alias, table);
				3722	iommu_flush_dte(iommu, alias);
				3723	}
				3724
				3725	out:
				3726	iommu_completion_wait(iommu);
				3727
				3728	out_unlock:
				3729	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
				3730
				3731	return table;
				3732	}
				3733
				3734	static int alloc_irq_index(u16 devid, int count)
				3735	{
				3736	struct irq_remap_table *table;
				3737	unsigned long flags;
				3738	int index, c;
				3739	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
				3740
				3741	if (!iommu)
				3742	return -ENODEV;
				3743
				3744	table = get_irq_table(devid, false);
				3745	if (!table)
				3746	return -ENODEV;
				3747
				3748	spin_lock_irqsave(&table->lock, flags);
				3749
				3750	/* Scan table for free entries */
				3751	for (c = 0, index = table->min_index;
				3752	index < MAX_IRQS_PER_TABLE;
				3753	++index) {
				3754	if (!iommu->irte_ops->is_allocated(table, index))
				3755	c += 1;
				3756	else
				3757	c = 0;
				3758
				3759	if (c == count) {
				3760	for (; c != 0; --c)
				3761	iommu->irte_ops->set_allocated(table, index - c + 1);
				3762
				3763	index -= count - 1;
				3764	goto out;
				3765	}
				3766	}
				3767
				3768	index = -ENOSPC;
				3769
				3770	out:
				3771	spin_unlock_irqrestore(&table->lock, flags);
				3772
				3773	return index;
				3774	}
				3775
				3776	static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
				3777	struct amd_ir_data *data)
				3778	{
				3779	struct irq_remap_table *table;
				3780	struct amd_iommu *iommu;
				3781	unsigned long flags;
				3782	struct irte_ga *entry;
				3783
				3784	iommu = amd_iommu_rlookup_table[devid];
				3785	if (iommu == NULL)
				3786	return -EINVAL;
				3787
				3788	table = get_irq_table(devid, false);
				3789	if (!table)
				3790	return -ENOMEM;
				3791
				3792	spin_lock_irqsave(&table->lock, flags);
				3793
				3794	entry = (struct irte_ga *)table->table;
				3795	entry = &entry[index];
				3796	entry->lo.fields_remap.valid = 0;
				3797	entry->hi.val = irte->hi.val;
				3798	entry->lo.val = irte->lo.val;
				3799	entry->lo.fields_remap.valid = 1;
				3800	if (data)
				3801	data->ref = entry;
				3802
				3803	spin_unlock_irqrestore(&table->lock, flags);
				3804
				3805	iommu_flush_irt(iommu, devid);
				3806	iommu_completion_wait(iommu);
				3807
				3808	return 0;
				3809	}
				3810
				3811	static int modify_irte(u16 devid, int index, union irte *irte)
				3812	{
				3813	struct irq_remap_table *table;
				3814	struct amd_iommu *iommu;
				3815	unsigned long flags;
				3816
				3817	iommu = amd_iommu_rlookup_table[devid];
				3818	if (iommu == NULL)
				3819	return -EINVAL;
				3820
				3821	table = get_irq_table(devid, false);
				3822	if (!table)
				3823	return -ENOMEM;
				3824
				3825	spin_lock_irqsave(&table->lock, flags);
				3826	table->table[index] = irte->val;
				3827	spin_unlock_irqrestore(&table->lock, flags);
				3828
				3829	iommu_flush_irt(iommu, devid);
				3830	iommu_completion_wait(iommu);
				3831
				3832	return 0;
				3833	}
				3834
				3835	static void free_irte(u16 devid, int index)
				3836	{
				3837	struct irq_remap_table *table;
				3838	struct amd_iommu *iommu;
				3839	unsigned long flags;
				3840
				3841	iommu = amd_iommu_rlookup_table[devid];
				3842	if (iommu == NULL)
				3843	return;
				3844
				3845	table = get_irq_table(devid, false);
				3846	if (!table)
				3847	return;
				3848
				3849	spin_lock_irqsave(&table->lock, flags);
				3850	iommu->irte_ops->clear_allocated(table, index);
				3851	spin_unlock_irqrestore(&table->lock, flags);
				3852
				3853	iommu_flush_irt(iommu, devid);
				3854	iommu_completion_wait(iommu);
				3855	}
				3856
				3857	static void irte_prepare(void *entry,
				3858	u32 delivery_mode, u32 dest_mode,
				3859	u8 vector, u32 dest_apicid, int devid)
				3860	{
				3861	union irte irte = (union irte ) entry;
				3862
				3863	irte->val = 0;
				3864	irte->fields.vector = vector;
				3865	irte->fields.int_type = delivery_mode;
				3866	irte->fields.destination = dest_apicid;
				3867	irte->fields.dm = dest_mode;
				3868	irte->fields.valid = 1;
				3869	}
				3870
				3871	static void irte_ga_prepare(void *entry,
				3872	u32 delivery_mode, u32 dest_mode,
				3873	u8 vector, u32 dest_apicid, int devid)
				3874	{
				3875	struct irte_ga irte = (struct irte_ga ) entry;
				3876
				3877	irte->lo.val = 0;
				3878	irte->hi.val = 0;
				3879	irte->lo.fields_remap.int_type = delivery_mode;
				3880	irte->lo.fields_remap.dm = dest_mode;
				3881	irte->hi.fields.vector = vector;
				3882	irte->lo.fields_remap.destination = dest_apicid;
				3883	irte->lo.fields_remap.valid = 1;
				3884	}
				3885
				3886	static void irte_activate(void *entry, u16 devid, u16 index)
				3887	{
				3888	union irte irte = (union irte ) entry;
				3889
				3890	irte->fields.valid = 1;
				3891	modify_irte(devid, index, irte);
				3892	}
				3893
				3894	static void irte_ga_activate(void *entry, u16 devid, u16 index)
				3895	{
				3896	struct irte_ga irte = (struct irte_ga ) entry;
				3897
				3898	irte->lo.fields_remap.valid = 1;
				3899	modify_irte_ga(devid, index, irte, NULL);
				3900	}
				3901
				3902	static void irte_deactivate(void *entry, u16 devid, u16 index)
				3903	{
				3904	union irte irte = (union irte ) entry;
				3905
				3906	irte->fields.valid = 0;
				3907	modify_irte(devid, index, irte);
				3908	}
				3909
				3910	static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
				3911	{
				3912	struct irte_ga irte = (struct irte_ga ) entry;
				3913
				3914	irte->lo.fields_remap.valid = 0;
				3915	modify_irte_ga(devid, index, irte, NULL);
				3916	}
				3917
				3918	static void irte_set_affinity(void *entry, u16 devid, u16 index,
				3919	u8 vector, u32 dest_apicid)
				3920	{
				3921	union irte irte = (union irte ) entry;
				3922
				3923	irte->fields.vector = vector;
				3924	irte->fields.destination = dest_apicid;
				3925	modify_irte(devid, index, irte);
				3926	}
				3927
				3928	static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
				3929	u8 vector, u32 dest_apicid)
				3930	{
				3931	struct irte_ga irte = (struct irte_ga ) entry;
				3932	struct iommu_dev_data *dev_data = search_dev_data(devid);
				3933
				3934	if (!dev_data \|\| !dev_data->use_vapic \|\|
				3935	!irte->lo.fields_remap.guest_mode) {
				3936	irte->hi.fields.vector = vector;
				3937	irte->lo.fields_remap.destination = dest_apicid;
				3938	modify_irte_ga(devid, index, irte, NULL);
				3939	}
				3940	}
				3941
				3942	#define IRTE_ALLOCATED (~1U)
				3943	static void irte_set_allocated(struct irq_remap_table *table, int index)
				3944	{
				3945	table->table[index] = IRTE_ALLOCATED;
				3946	}
				3947
				3948	static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
				3949	{
				3950	struct irte_ga ptr = (struct irte_ga )table->table;
				3951	struct irte_ga *irte = &ptr[index];
				3952
				3953	memset(&irte->lo.val, 0, sizeof(u64));
				3954	memset(&irte->hi.val, 0, sizeof(u64));
				3955	irte->hi.fields.vector = 0xff;
				3956	}
				3957
				3958	static bool irte_is_allocated(struct irq_remap_table *table, int index)
				3959	{
				3960	union irte ptr = (union irte )table->table;
				3961	union irte *irte = &ptr[index];
				3962
				3963	return irte->val != 0;
				3964	}
				3965
				3966	static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
				3967	{
				3968	struct irte_ga ptr = (struct irte_ga )table->table;
				3969	struct irte_ga *irte = &ptr[index];
				3970
				3971	return irte->hi.fields.vector != 0;
				3972	}
				3973
				3974	static void irte_clear_allocated(struct irq_remap_table *table, int index)
				3975	{
				3976	table->table[index] = 0;
				3977	}
				3978
				3979	static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
				3980	{
				3981	struct irte_ga ptr = (struct irte_ga )table->table;
				3982	struct irte_ga *irte = &ptr[index];
				3983
				3984	memset(&irte->lo.val, 0, sizeof(u64));
				3985	memset(&irte->hi.val, 0, sizeof(u64));
				3986	}
				3987
				3988	static int get_devid(struct irq_alloc_info *info)
				3989	{
				3990	int devid = -1;
				3991
				3992	switch (info->type) {
				3993	case X86_IRQ_ALLOC_TYPE_IOAPIC:
				3994	devid = get_ioapic_devid(info->ioapic_id);
				3995	break;
				3996	case X86_IRQ_ALLOC_TYPE_HPET:
				3997	devid = get_hpet_devid(info->hpet_id);
				3998	break;
				3999	case X86_IRQ_ALLOC_TYPE_MSI:
				4000	case X86_IRQ_ALLOC_TYPE_MSIX:
				4001	devid = get_device_id(&info->msi_dev->dev);
				4002	break;
				4003	default:
				4004	BUG_ON(1);
				4005	break;
				4006	}
				4007
				4008	return devid;
				4009	}
				4010
				4011	static struct irq_domain get_ir_irq_domain(struct irq_alloc_info info)
				4012	{
				4013	struct amd_iommu *iommu;
				4014	int devid;
				4015
				4016	if (!info)
				4017	return NULL;
				4018
				4019	devid = get_devid(info);
				4020	if (devid >= 0) {
				4021	iommu = amd_iommu_rlookup_table[devid];
				4022	if (iommu)
				4023	return iommu->ir_domain;
				4024	}
				4025
				4026	return NULL;
				4027	}
				4028
				4029	static struct irq_domain get_irq_domain(struct irq_alloc_info info)
				4030	{
				4031	struct amd_iommu *iommu;
				4032	int devid;
				4033
				4034	if (!info)
				4035	return NULL;
				4036
				4037	switch (info->type) {
				4038	case X86_IRQ_ALLOC_TYPE_MSI:
				4039	case X86_IRQ_ALLOC_TYPE_MSIX:
				4040	devid = get_device_id(&info->msi_dev->dev);
				4041	if (devid < 0)
				4042	return NULL;
				4043
				4044	iommu = amd_iommu_rlookup_table[devid];
				4045	if (iommu)
				4046	return iommu->msi_domain;
				4047	break;
				4048	default:
				4049	break;
				4050	}
				4051
				4052	return NULL;
				4053	}
				4054
				4055	struct irq_remap_ops amd_iommu_irq_ops = {
				4056	.prepare = amd_iommu_prepare,
				4057	.enable = amd_iommu_enable,
				4058	.disable = amd_iommu_disable,
				4059	.reenable = amd_iommu_reenable,
				4060	.enable_faulting = amd_iommu_enable_faulting,
				4061	.get_ir_irq_domain = get_ir_irq_domain,
				4062	.get_irq_domain = get_irq_domain,
				4063	};
				4064
				4065	static void irq_remapping_prepare_irte(struct amd_ir_data *data,
				4066	struct irq_cfg *irq_cfg,
				4067	struct irq_alloc_info *info,
				4068	int devid, int index, int sub_handle)
				4069	{
				4070	struct irq_2_irte *irte_info = &data->irq_2_irte;
				4071	struct msi_msg *msg = &data->msi_entry;
				4072	struct IO_APIC_route_entry *entry;
				4073	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
				4074
				4075	if (!iommu)
				4076	return;
				4077
				4078	data->irq_2_irte.devid = devid;
				4079	data->irq_2_irte.index = index + sub_handle;
				4080	iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
				4081	apic->irq_dest_mode, irq_cfg->vector,
				4082	irq_cfg->dest_apicid, devid);
				4083
				4084	switch (info->type) {
				4085	case X86_IRQ_ALLOC_TYPE_IOAPIC:
				4086	/* Setup IOAPIC entry */
				4087	entry = info->ioapic_entry;
				4088	info->ioapic_entry = NULL;
				4089	memset(entry, 0, sizeof(*entry));
				4090	entry->vector = index;
				4091	entry->mask = 0;
				4092	entry->trigger = info->ioapic_trigger;
				4093	entry->polarity = info->ioapic_polarity;
				4094	/* Mask level triggered irqs. */
				4095	if (info->ioapic_trigger)
				4096	entry->mask = 1;
				4097	break;
				4098
				4099	case X86_IRQ_ALLOC_TYPE_HPET:
				4100	case X86_IRQ_ALLOC_TYPE_MSI:
				4101	case X86_IRQ_ALLOC_TYPE_MSIX:
				4102	msg->address_hi = MSI_ADDR_BASE_HI;
				4103	msg->address_lo = MSI_ADDR_BASE_LO;
				4104	msg->data = irte_info->index;
				4105	break;
				4106
				4107	default:
				4108	BUG_ON(1);
				4109	break;
				4110	}
				4111	}
				4112
				4113	struct amd_irte_ops irte_32_ops = {
				4114	.prepare = irte_prepare,
				4115	.activate = irte_activate,
				4116	.deactivate = irte_deactivate,
				4117	.set_affinity = irte_set_affinity,
				4118	.set_allocated = irte_set_allocated,
				4119	.is_allocated = irte_is_allocated,
				4120	.clear_allocated = irte_clear_allocated,
				4121	};
				4122
				4123	struct amd_irte_ops irte_128_ops = {
				4124	.prepare = irte_ga_prepare,
				4125	.activate = irte_ga_activate,
				4126	.deactivate = irte_ga_deactivate,
				4127	.set_affinity = irte_ga_set_affinity,
				4128	.set_allocated = irte_ga_set_allocated,
				4129	.is_allocated = irte_ga_is_allocated,
				4130	.clear_allocated = irte_ga_clear_allocated,
				4131	};
				4132
				4133	static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
				4134	unsigned int nr_irqs, void *arg)
				4135	{
				4136	struct irq_alloc_info *info = arg;
				4137	struct irq_data *irq_data;
				4138	struct amd_ir_data *data = NULL;
				4139	struct irq_cfg *cfg;
				4140	int i, ret, devid;
				4141	int index = -1;
				4142
				4143	if (!info)
				4144	return -EINVAL;
				4145	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
				4146	info->type != X86_IRQ_ALLOC_TYPE_MSIX)
				4147	return -EINVAL;
				4148
				4149	/*
				4150	* With IRQ remapping enabled, don't need contiguous CPU vectors
				4151	* to support multiple MSI interrupts.
				4152	*/
				4153	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
				4154	info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
				4155
				4156	devid = get_devid(info);
				4157	if (devid < 0)
				4158	return -EINVAL;
				4159
				4160	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
				4161	if (ret < 0)
				4162	return ret;
				4163
				4164	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
				4165	if (get_irq_table(devid, true))
				4166	index = info->ioapic_pin;
				4167	else
				4168	ret = -ENOMEM;
				4169	} else {
				4170	index = alloc_irq_index(devid, nr_irqs);
				4171	}
				4172	if (index < 0) {
				4173	pr_warn("Failed to allocate IRTE\n");
				4174	ret = index;
				4175	goto out_free_parent;
				4176	}
				4177
				4178	for (i = 0; i < nr_irqs; i++) {
				4179	irq_data = irq_domain_get_irq_data(domain, virq + i);
				4180	cfg = irqd_cfg(irq_data);
				4181	if (!irq_data \|\| !cfg) {
				4182	ret = -EINVAL;
				4183	goto out_free_data;
				4184	}
				4185
				4186	ret = -ENOMEM;
				4187	data = kzalloc(sizeof(*data), GFP_KERNEL);
				4188	if (!data)
				4189	goto out_free_data;
				4190
				4191	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
				4192	data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
				4193	else
				4194	data->entry = kzalloc(sizeof(struct irte_ga),
				4195	GFP_KERNEL);
				4196	if (!data->entry) {
				4197	kfree(data);
				4198	goto out_free_data;
				4199	}
				4200
				4201	irq_data->hwirq = (devid << 16) + i;
				4202	irq_data->chip_data = data;
				4203	irq_data->chip = &amd_ir_chip;
				4204	irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
				4205	irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
				4206	}
				4207
				4208	return 0;
				4209
				4210	out_free_data:
				4211	for (i--; i >= 0; i--) {
				4212	irq_data = irq_domain_get_irq_data(domain, virq + i);
				4213	if (irq_data)
				4214	kfree(irq_data->chip_data);
				4215	}
				4216	for (i = 0; i < nr_irqs; i++)
				4217	free_irte(devid, index + i);
				4218	out_free_parent:
				4219	irq_domain_free_irqs_common(domain, virq, nr_irqs);
				4220	return ret;
				4221	}
				4222
				4223	static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
				4224	unsigned int nr_irqs)
				4225	{
				4226	struct irq_2_irte *irte_info;
				4227	struct irq_data *irq_data;
				4228	struct amd_ir_data *data;
				4229	int i;
				4230
				4231	for (i = 0; i < nr_irqs; i++) {
				4232	irq_data = irq_domain_get_irq_data(domain, virq + i);
				4233	if (irq_data && irq_data->chip_data) {
				4234	data = irq_data->chip_data;
				4235	irte_info = &data->irq_2_irte;
				4236	free_irte(irte_info->devid, irte_info->index);
				4237	kfree(data->entry);
				4238	kfree(data);
				4239	}
				4240	}
				4241	irq_domain_free_irqs_common(domain, virq, nr_irqs);
				4242	}
				4243
				4244	static void irq_remapping_activate(struct irq_domain *domain,
				4245	struct irq_data *irq_data)
				4246	{
				4247	struct amd_ir_data *data = irq_data->chip_data;
				4248	struct irq_2_irte *irte_info = &data->irq_2_irte;
				4249	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
				4250
				4251	if (iommu)
				4252	iommu->irte_ops->activate(data->entry, irte_info->devid,
				4253	irte_info->index);
				4254	}
				4255
				4256	static void irq_remapping_deactivate(struct irq_domain *domain,
				4257	struct irq_data *irq_data)
				4258	{
				4259	struct amd_ir_data *data = irq_data->chip_data;
				4260	struct irq_2_irte *irte_info = &data->irq_2_irte;
				4261	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
				4262
				4263	if (iommu)
				4264	iommu->irte_ops->deactivate(data->entry, irte_info->devid,
				4265	irte_info->index);
				4266	}
				4267
				4268	static const struct irq_domain_ops amd_ir_domain_ops = {
				4269	.alloc = irq_remapping_alloc,
				4270	.free = irq_remapping_free,
				4271	.activate = irq_remapping_activate,
				4272	.deactivate = irq_remapping_deactivate,
				4273	};
				4274
				4275	static int amd_ir_set_vcpu_affinity(struct irq_data data, void vcpu_info)
				4276	{
				4277	struct amd_iommu *iommu;
				4278	struct amd_iommu_pi_data *pi_data = vcpu_info;
				4279	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
				4280	struct amd_ir_data *ir_data = data->chip_data;
				4281	struct irte_ga irte = (struct irte_ga ) ir_data->entry;
				4282	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
				4283	struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
				4284
				4285	/* Note:
				4286	* This device has never been set up for guest mode.
				4287	* we should not modify the IRTE
				4288	*/
				4289	if (!dev_data \|\| !dev_data->use_vapic)
				4290	return 0;
				4291
				4292	pi_data->ir_data = ir_data;
				4293
				4294	/* Note:
				4295	* SVM tries to set up for VAPIC mode, but we are in
				4296	* legacy mode. So, we force legacy mode instead.
				4297	*/
				4298	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
				4299	pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
				4300	__func__);
				4301	pi_data->is_guest_mode = false;
				4302	}
				4303
				4304	iommu = amd_iommu_rlookup_table[irte_info->devid];
				4305	if (iommu == NULL)
				4306	return -EINVAL;
				4307
				4308	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
				4309	if (pi_data->is_guest_mode) {
				4310	/* Setting */
				4311	irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
				4312	irte->hi.fields.vector = vcpu_pi_info->vector;
				4313	irte->lo.fields_vapic.ga_log_intr = 1;
				4314	irte->lo.fields_vapic.guest_mode = 1;
				4315	irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
				4316
				4317	ir_data->cached_ga_tag = pi_data->ga_tag;
				4318	} else {
				4319	/* Un-Setting */
				4320	struct irq_cfg *cfg = irqd_cfg(data);
				4321
				4322	irte->hi.val = 0;
				4323	irte->lo.val = 0;
				4324	irte->hi.fields.vector = cfg->vector;
				4325	irte->lo.fields_remap.guest_mode = 0;
				4326	irte->lo.fields_remap.destination = cfg->dest_apicid;
				4327	irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
				4328	irte->lo.fields_remap.dm = apic->irq_dest_mode;
				4329
				4330	/*
				4331	* This communicates the ga_tag back to the caller
				4332	* so that it can do all the necessary clean up.
				4333	*/
				4334	ir_data->cached_ga_tag = 0;
				4335	}
				4336
				4337	return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
				4338	}
				4339
				4340	static int amd_ir_set_affinity(struct irq_data *data,
				4341	const struct cpumask *mask, bool force)
				4342	{
				4343	struct amd_ir_data *ir_data = data->chip_data;
				4344	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
				4345	struct irq_cfg *cfg = irqd_cfg(data);
				4346	struct irq_data *parent = data->parent_data;
				4347	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
				4348	int ret;
				4349
				4350	if (!iommu)
				4351	return -ENODEV;
				4352
				4353	ret = parent->chip->irq_set_affinity(parent, mask, force);
				4354	if (ret < 0 \|\| ret == IRQ_SET_MASK_OK_DONE)
				4355	return ret;
				4356
				4357	/*
				4358	* Atomically updates the IRTE with the new destination, vector
				4359	* and flushes the interrupt entry cache.
				4360	*/
				4361	iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
				4362	irte_info->index, cfg->vector, cfg->dest_apicid);
				4363
				4364	/*
				4365	* After this point, all the interrupts will start arriving
				4366	* at the new destination. So, time to cleanup the previous
				4367	* vector allocation.
				4368	*/
				4369	send_cleanup_vector(cfg);
				4370
				4371	return IRQ_SET_MASK_OK_DONE;
				4372	}
				4373
				4374	static void ir_compose_msi_msg(struct irq_data irq_data, struct msi_msg msg)
				4375	{
				4376	struct amd_ir_data *ir_data = irq_data->chip_data;
				4377
				4378	*msg = ir_data->msi_entry;
				4379	}
				4380
				4381	static struct irq_chip amd_ir_chip = {
				4382	.name = "AMD-IR",
				4383	.irq_ack = ir_ack_apic_edge,
				4384	.irq_set_affinity = amd_ir_set_affinity,
				4385	.irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
				4386	.irq_compose_msi_msg = ir_compose_msi_msg,
				4387	};
				4388
				4389	int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
				4390	{
				4391	struct fwnode_handle *fn;
				4392
				4393	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
				4394	if (!fn)
				4395	return -ENOMEM;
				4396	iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
				4397	if (!iommu->ir_domain) {
				4398	irq_domain_free_fwnode(fn);
				4399	return -ENOMEM;
				4400	}
				4401
				4402	iommu->ir_domain->parent = arch_get_ir_parent_domain();
				4403	iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
				4404	"AMD-IR-MSI",
				4405	iommu->index);
				4406	return 0;
				4407	}
				4408
				4409	int amd_iommu_update_ga(int cpu, bool is_run, void *data)
				4410	{
				4411	unsigned long flags;
				4412	struct amd_iommu *iommu;
				4413	struct irq_remap_table *irt;
				4414	struct amd_ir_data ir_data = (struct amd_ir_data )data;
				4415	int devid = ir_data->irq_2_irte.devid;
				4416	struct irte_ga entry = (struct irte_ga ) ir_data->entry;
				4417	struct irte_ga ref = (struct irte_ga ) ir_data->ref;
				4418
				4419	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) \|\|
				4420	!ref \|\| !entry \|\| !entry->lo.fields_vapic.guest_mode)
				4421	return 0;
				4422
				4423	iommu = amd_iommu_rlookup_table[devid];
				4424	if (!iommu)
				4425	return -ENODEV;
				4426
				4427	irt = get_irq_table(devid, false);
				4428	if (!irt)
				4429	return -ENODEV;
				4430
				4431	spin_lock_irqsave(&irt->lock, flags);
				4432
				4433	if (ref->lo.fields_vapic.guest_mode) {
				4434	if (cpu >= 0)
				4435	ref->lo.fields_vapic.destination = cpu;
				4436	ref->lo.fields_vapic.is_run = is_run;
				4437	barrier();
				4438	}
				4439
				4440	spin_unlock_irqrestore(&irt->lock, flags);
				4441
				4442	iommu_flush_irt(iommu, devid);
				4443	iommu_completion_wait(iommu);
				4444	return 0;
				4445	}
				4446	EXPORT_SYMBOL(amd_iommu_update_ga);
				4447	#endif