Blame - src/kernel/linux/v4.19/drivers/iommu/intel-iommu.c - T800

blob: b9af2419006f8341563551d2f32c1226c081b249 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Copyright © 2006-2014 Intel Corporation.
				3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms and conditions of the GNU General Public License,
				6	* version 2, as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope it will be useful, but WITHOUT
				9	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				10	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
				11	* more details.
				12	*
				13	* Authors: David Woodhouse <dwmw2@infradead.org>,
				14	* Ashok Raj <ashok.raj@intel.com>,
				15	* Shaohua Li <shaohua.li@intel.com>,
				16	* Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
				17	* Fenghua Yu <fenghua.yu@intel.com>
				18	* Joerg Roedel <jroedel@suse.de>
				19	*/
				20
				21	#define pr_fmt(fmt) "DMAR: " fmt
				22
				23	#include <linux/init.h>
				24	#include <linux/bitmap.h>
				25	#include <linux/debugfs.h>
				26	#include <linux/export.h>
				27	#include <linux/slab.h>
				28	#include <linux/irq.h>
				29	#include <linux/interrupt.h>
				30	#include <linux/spinlock.h>
				31	#include <linux/pci.h>
				32	#include <linux/dmar.h>
				33	#include <linux/dma-mapping.h>
				34	#include <linux/mempool.h>
				35	#include <linux/memory.h>
				36	#include <linux/cpu.h>
				37	#include <linux/timer.h>
				38	#include <linux/io.h>
				39	#include <linux/iova.h>
				40	#include <linux/iommu.h>
				41	#include <linux/intel-iommu.h>
				42	#include <linux/syscore_ops.h>
				43	#include <linux/tboot.h>
				44	#include <linux/dmi.h>
				45	#include <linux/pci-ats.h>
				46	#include <linux/memblock.h>
				47	#include <linux/dma-contiguous.h>
				48	#include <linux/dma-direct.h>
				49	#include <linux/crash_dump.h>
				50	#include <asm/irq_remapping.h>
				51	#include <asm/cacheflush.h>
				52	#include <asm/iommu.h>
				53
				54	#include "irq_remapping.h"
				55	#include "intel-pasid.h"
				56
				57	#define ROOT_SIZE VTD_PAGE_SIZE
				58	#define CONTEXT_SIZE VTD_PAGE_SIZE
				59
				60	#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
				61	#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
				62	#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
				63	#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
				64
				65	#define IOAPIC_RANGE_START (0xfee00000)
				66	#define IOAPIC_RANGE_END (0xfeefffff)
				67	#define IOVA_START_ADDR (0x1000)
				68
				69	#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
				70
				71	#define MAX_AGAW_WIDTH 64
				72	#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
				73
				74	#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
				75	#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
				76
				77	/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
				78	to match. That way, we can use 'unsigned long' for PFNs with impunity. */
				79	#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
				80	__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
				81	#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
				82
				83	/* IO virtual address start page frame number */
				84	#define IOVA_START_PFN (1)
				85
				86	#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
				87
				88	/* page table handling */
				89	#define LEVEL_STRIDE (9)
				90	#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
				91
				92	/*
				93	* This bitmap is used to advertise the page sizes our hardware support
				94	* to the IOMMU core, which will then use this information to split
				95	* physically contiguous memory regions it is mapping into page sizes
				96	* that we support.
				97	*
				98	* Traditionally the IOMMU core just handed us the mappings directly,
				99	* after making sure the size is an order of a 4KiB page and that the
				100	* mapping has natural alignment.
				101	*
				102	* To retain this behavior, we currently advertise that we support
				103	* all page sizes that are an order of 4KiB.
				104	*
				105	* If at some point we'd like to utilize the IOMMU core's new behavior,
				106	* we could change this to advertise the real page sizes we support.
				107	*/
				108	#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
				109
				110	static inline int agaw_to_level(int agaw)
				111	{
				112	return agaw + 2;
				113	}
				114
				115	static inline int agaw_to_width(int agaw)
				116	{
				117	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
				118	}
				119
				120	static inline int width_to_agaw(int width)
				121	{
				122	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
				123	}
				124
				125	static inline unsigned int level_to_offset_bits(int level)
				126	{
				127	return (level - 1) * LEVEL_STRIDE;
				128	}
				129
				130	static inline int pfn_level_offset(unsigned long pfn, int level)
				131	{
				132	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
				133	}
				134
				135	static inline unsigned long level_mask(int level)
				136	{
				137	return -1UL << level_to_offset_bits(level);
				138	}
				139
				140	static inline unsigned long level_size(int level)
				141	{
				142	return 1UL << level_to_offset_bits(level);
				143	}
				144
				145	static inline unsigned long align_to_level(unsigned long pfn, int level)
				146	{
				147	return (pfn + level_size(level) - 1) & level_mask(level);
				148	}
				149
				150	static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
				151	{
				152	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
				153	}
				154
				155	/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
				156	are never going to work. */
				157	static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
				158	{
				159	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
				160	}
				161
				162	static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
				163	{
				164	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
				165	}
				166	static inline unsigned long page_to_dma_pfn(struct page *pg)
				167	{
				168	return mm_to_dma_pfn(page_to_pfn(pg));
				169	}
				170	static inline unsigned long virt_to_dma_pfn(void *p)
				171	{
				172	return page_to_dma_pfn(virt_to_page(p));
				173	}
				174
				175	/* global iommu list, set NULL for ignored DMAR units */
				176	static struct intel_iommu **g_iommus;
				177
				178	static void __init check_tylersburg_isoch(void);
				179	static int rwbf_quirk;
				180
				181	/*
				182	* set to 1 to panic kernel if can't successfully enable VT-d
				183	* (used when kernel is launched w/ TXT)
				184	*/
				185	static int force_on = 0;
				186	int intel_iommu_tboot_noforce;
				187
				188	/*
				189	* 0: Present
				190	* 1-11: Reserved
				191	* 12-63: Context Ptr (12 - (haw-1))
				192	* 64-127: Reserved
				193	*/
				194	struct root_entry {
				195	u64 lo;
				196	u64 hi;
				197	};
				198	#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
				199
				200	/*
				201	* Take a root_entry and return the Lower Context Table Pointer (LCTP)
				202	* if marked present.
				203	*/
				204	static phys_addr_t root_entry_lctp(struct root_entry *re)
				205	{
				206	if (!(re->lo & 1))
				207	return 0;
				208
				209	return re->lo & VTD_PAGE_MASK;
				210	}
				211
				212	/*
				213	* Take a root_entry and return the Upper Context Table Pointer (UCTP)
				214	* if marked present.
				215	*/
				216	static phys_addr_t root_entry_uctp(struct root_entry *re)
				217	{
				218	if (!(re->hi & 1))
				219	return 0;
				220
				221	return re->hi & VTD_PAGE_MASK;
				222	}
				223	/*
				224	* low 64 bits:
				225	* 0: present
				226	* 1: fault processing disable
				227	* 2-3: translation type
				228	* 12-63: address space root
				229	* high 64 bits:
				230	* 0-2: address width
				231	* 3-6: aval
				232	* 8-23: domain id
				233	*/
				234	struct context_entry {
				235	u64 lo;
				236	u64 hi;
				237	};
				238
				239	static inline void context_clear_pasid_enable(struct context_entry *context)
				240	{
				241	context->lo &= ~(1ULL << 11);
				242	}
				243
				244	static inline bool context_pasid_enabled(struct context_entry *context)
				245	{
				246	return !!(context->lo & (1ULL << 11));
				247	}
				248
				249	static inline void context_set_copied(struct context_entry *context)
				250	{
				251	context->hi \|= (1ull << 3);
				252	}
				253
				254	static inline bool context_copied(struct context_entry *context)
				255	{
				256	return !!(context->hi & (1ULL << 3));
				257	}
				258
				259	static inline bool __context_present(struct context_entry *context)
				260	{
				261	return (context->lo & 1);
				262	}
				263
				264	static inline bool context_present(struct context_entry *context)
				265	{
				266	return context_pasid_enabled(context) ?
				267	__context_present(context) :
				268	__context_present(context) && !context_copied(context);
				269	}
				270
				271	static inline void context_set_present(struct context_entry *context)
				272	{
				273	context->lo \|= 1;
				274	}
				275
				276	static inline void context_set_fault_enable(struct context_entry *context)
				277	{
				278	context->lo &= (((u64)-1) << 2) \| 1;
				279	}
				280
				281	static inline void context_set_translation_type(struct context_entry *context,
				282	unsigned long value)
				283	{
				284	context->lo &= (((u64)-1) << 4) \| 3;
				285	context->lo \|= (value & 3) << 2;
				286	}
				287
				288	static inline void context_set_address_root(struct context_entry *context,
				289	unsigned long value)
				290	{
				291	context->lo &= ~VTD_PAGE_MASK;
				292	context->lo \|= value & VTD_PAGE_MASK;
				293	}
				294
				295	static inline void context_set_address_width(struct context_entry *context,
				296	unsigned long value)
				297	{
				298	context->hi \|= value & 7;
				299	}
				300
				301	static inline void context_set_domain_id(struct context_entry *context,
				302	unsigned long value)
				303	{
				304	context->hi \|= (value & ((1 << 16) - 1)) << 8;
				305	}
				306
				307	static inline int context_domain_id(struct context_entry *c)
				308	{
				309	return((c->hi >> 8) & 0xffff);
				310	}
				311
				312	static inline void context_clear_entry(struct context_entry *context)
				313	{
				314	context->lo = 0;
				315	context->hi = 0;
				316	}
				317
				318	/*
				319	* 0: readable
				320	* 1: writable
				321	* 2-6: reserved
				322	* 7: super page
				323	* 8-10: available
				324	* 11: snoop behavior
				325	* 12-63: Host physcial address
				326	*/
				327	struct dma_pte {
				328	u64 val;
				329	};
				330
				331	static inline void dma_clear_pte(struct dma_pte *pte)
				332	{
				333	pte->val = 0;
				334	}
				335
				336	static inline u64 dma_pte_addr(struct dma_pte *pte)
				337	{
				338	#ifdef CONFIG_64BIT
				339	return pte->val & VTD_PAGE_MASK;
				340	#else
				341	/* Must have a full atomic 64-bit read */
				342	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
				343	#endif
				344	}
				345
				346	static inline bool dma_pte_present(struct dma_pte *pte)
				347	{
				348	return (pte->val & 3) != 0;
				349	}
				350
				351	static inline bool dma_pte_superpage(struct dma_pte *pte)
				352	{
				353	return (pte->val & DMA_PTE_LARGE_PAGE);
				354	}
				355
				356	static inline int first_pte_in_page(struct dma_pte *pte)
				357	{
				358	return !((unsigned long)pte & ~VTD_PAGE_MASK);
				359	}
				360
				361	/*
				362	* This domain is a statically identity mapping domain.
				363	* 1. This domain creats a static 1:1 mapping to all usable memory.
				364	* 2. It maps to each iommu if successful.
				365	* 3. Each iommu mapps to this domain if successful.
				366	*/
				367	static struct dmar_domain *si_domain;
				368	static int hw_pass_through = 1;
				369
				370	/*
				371	* Domain represents a virtual machine, more than one devices
				372	* across iommus may be owned in one domain, e.g. kvm guest.
				373	*/
				374	#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
				375
				376	/* si_domain contains mulitple devices */
				377	#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
				378
				379	#define for_each_domain_iommu(idx, domain) \
				380	for (idx = 0; idx < g_num_of_iommus; idx++) \
				381	if (domain->iommu_refcnt[idx])
				382
				383	struct dmar_rmrr_unit {
				384	struct list_head list; /* list of rmrr units */
				385	struct acpi_dmar_header hdr; / ACPI header */
				386	u64 base_address; /* reserved base address*/
				387	u64 end_address; /* reserved end address */
				388	struct dmar_dev_scope devices; / target devices */
				389	int devices_cnt; /* target device count */
				390	struct iommu_resv_region resv; / reserved region handle */
				391	};
				392
				393	struct dmar_atsr_unit {
				394	struct list_head list; /* list of ATSR units */
				395	struct acpi_dmar_header hdr; / ACPI header */
				396	struct dmar_dev_scope devices; / target devices */
				397	int devices_cnt; /* target device count */
				398	u8 include_all:1; /* include all ports */
				399	};
				400
				401	static LIST_HEAD(dmar_atsr_units);
				402	static LIST_HEAD(dmar_rmrr_units);
				403
				404	#define for_each_rmrr_units(rmrr) \
				405	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
				406
				407	/* bitmap for indexing intel_iommus */
				408	static int g_num_of_iommus;
				409
				410	static void domain_exit(struct dmar_domain *domain);
				411	static void domain_remove_dev_info(struct dmar_domain *domain);
				412	static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				413	struct device *dev);
				414	static void __dmar_remove_one_dev_info(struct device_domain_info *info);
				415	static void domain_context_clear(struct intel_iommu *iommu,
				416	struct device *dev);
				417	static int domain_detach_iommu(struct dmar_domain *domain,
				418	struct intel_iommu *iommu);
				419
				420	#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
				421	int dmar_disabled = 0;
				422	#else
				423	int dmar_disabled = 1;
				424	#endif /CONFIG_INTEL_IOMMU_DEFAULT_ON/
				425
				426	int intel_iommu_enabled = 0;
				427	EXPORT_SYMBOL_GPL(intel_iommu_enabled);
				428
				429	static int dmar_map_gfx = 1;
				430	static int dmar_forcedac;
				431	static int intel_iommu_strict;
				432	static int intel_iommu_superpage = 1;
				433	static int intel_iommu_ecs = 1;
				434	static int intel_iommu_pasid28;
				435	static int iommu_identity_mapping;
				436
				437	#define IDENTMAP_ALL 1
				438	#define IDENTMAP_GFX 2
				439	#define IDENTMAP_AZALIA 4
				440
				441	/* Broadwell and Skylake have broken ECS support — normal so-called "second
				442	* level" translation of DMA requests-without-PASID doesn't actually happen
				443	* unless you also set the NESTE bit in an extended context-entry. Which of
				444	* course means that SVM doesn't work because it's trying to do nested
				445	* translation of the physical addresses it finds in the process page tables,
				446	* through the IOVA->phys mapping found in the "second level" page tables.
				447	*
				448	* The VT-d specification was retroactively changed to change the definition
				449	* of the capability bits and pretend that Broadwell/Skylake never happened...
				450	* but unfortunately the wrong bit was changed. It's ECS which is broken, but
				451	* for some reason it was the PASID capability bit which was redefined (from
				452	* bit 28 on BDW/SKL to bit 40 in future).
				453	*
				454	* So our test for ECS needs to eschew those implementations which set the old
				455	* PASID capabiity bit 28, since those are the ones on which ECS is broken.
				456	* Unless we are working around the 'pasid28' limitations, that is, by putting
				457	* the device into passthrough mode for normal DMA and thus masking the bug.
				458	*/
				459	#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
				460	(intel_iommu_pasid28 \|\| !ecap_broken_pasid(iommu->ecap)))
				461	/* PASID support is thus enabled if ECS is enabled and either of the old
				462	* or new capability bits are set. */
				463	#define pasid_enabled(iommu) (ecs_enabled(iommu) && \
				464	(ecap_pasid(iommu->ecap) \|\| ecap_broken_pasid(iommu->ecap)))
				465
				466	int intel_iommu_gfx_mapped;
				467	EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
				468
				469	#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
				470	static DEFINE_SPINLOCK(device_domain_lock);
				471	static LIST_HEAD(device_domain_list);
				472
				473	/*
				474	* Iterate over elements in device_domain_list and call the specified
				475	* callback @fn against each element. This helper should only be used
				476	* in the context where the device_domain_lock has already been holden.
				477	*/
				478	int for_each_device_domain(int (fn)(struct device_domain_info info,
				479	void data), void data)
				480	{
				481	int ret = 0;
				482	struct device_domain_info *info;
				483
				484	assert_spin_locked(&device_domain_lock);
				485	list_for_each_entry(info, &device_domain_list, global) {
				486	ret = fn(info, data);
				487	if (ret)
				488	return ret;
				489	}
				490
				491	return 0;
				492	}
				493
				494	const struct iommu_ops intel_iommu_ops;
				495
				496	static bool translation_pre_enabled(struct intel_iommu *iommu)
				497	{
				498	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
				499	}
				500
				501	static void clear_translation_pre_enabled(struct intel_iommu *iommu)
				502	{
				503	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
				504	}
				505
				506	static void init_translation_status(struct intel_iommu *iommu)
				507	{
				508	u32 gsts;
				509
				510	gsts = readl(iommu->reg + DMAR_GSTS_REG);
				511	if (gsts & DMA_GSTS_TES)
				512	iommu->flags \|= VTD_FLAG_TRANS_PRE_ENABLED;
				513	}
				514
				515	/* Convert generic 'struct iommu_domain to private struct dmar_domain */
				516	static struct dmar_domain to_dmar_domain(struct iommu_domain dom)
				517	{
				518	return container_of(dom, struct dmar_domain, domain);
				519	}
				520
				521	static int __init intel_iommu_setup(char *str)
				522	{
				523	if (!str)
				524	return -EINVAL;
				525	while (*str) {
				526	if (!strncmp(str, "on", 2)) {
				527	dmar_disabled = 0;
				528	pr_info("IOMMU enabled\n");
				529	} else if (!strncmp(str, "off", 3)) {
				530	dmar_disabled = 1;
				531	pr_info("IOMMU disabled\n");
				532	} else if (!strncmp(str, "igfx_off", 8)) {
				533	dmar_map_gfx = 0;
				534	pr_info("Disable GFX device mapping\n");
				535	} else if (!strncmp(str, "forcedac", 8)) {
				536	pr_info("Forcing DAC for PCI devices\n");
				537	dmar_forcedac = 1;
				538	} else if (!strncmp(str, "strict", 6)) {
				539	pr_info("Disable batched IOTLB flush\n");
				540	intel_iommu_strict = 1;
				541	} else if (!strncmp(str, "sp_off", 6)) {
				542	pr_info("Disable supported super page\n");
				543	intel_iommu_superpage = 0;
				544	} else if (!strncmp(str, "ecs_off", 7)) {
				545	printk(KERN_INFO
				546	"Intel-IOMMU: disable extended context table support\n");
				547	intel_iommu_ecs = 0;
				548	} else if (!strncmp(str, "pasid28", 7)) {
				549	printk(KERN_INFO
				550	"Intel-IOMMU: enable pre-production PASID support\n");
				551	intel_iommu_pasid28 = 1;
				552	iommu_identity_mapping \|= IDENTMAP_GFX;
				553	} else if (!strncmp(str, "tboot_noforce", 13)) {
				554	printk(KERN_INFO
				555	"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
				556	intel_iommu_tboot_noforce = 1;
				557	}
				558
				559	str += strcspn(str, ",");
				560	while (*str == ',')
				561	str++;
				562	}
				563	return 0;
				564	}
				565	__setup("intel_iommu=", intel_iommu_setup);
				566
				567	static struct kmem_cache *iommu_domain_cache;
				568	static struct kmem_cache *iommu_devinfo_cache;
				569
				570	static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
				571	{
				572	struct dmar_domain **domains;
				573	int idx = did >> 8;
				574
				575	domains = iommu->domains[idx];
				576	if (!domains)
				577	return NULL;
				578
				579	return domains[did & 0xff];
				580	}
				581
				582	static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
				583	struct dmar_domain *domain)
				584	{
				585	struct dmar_domain **domains;
				586	int idx = did >> 8;
				587
				588	if (!iommu->domains[idx]) {
				589	size_t size = 256 * sizeof(struct dmar_domain *);
				590	iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
				591	}
				592
				593	domains = iommu->domains[idx];
				594	if (WARN_ON(!domains))
				595	return;
				596	else
				597	domains[did & 0xff] = domain;
				598	}
				599
				600	void *alloc_pgtable_page(int node)
				601	{
				602	struct page *page;
				603	void *vaddr = NULL;
				604
				605	page = alloc_pages_node(node, GFP_ATOMIC \| __GFP_ZERO, 0);
				606	if (page)
				607	vaddr = page_address(page);
				608	return vaddr;
				609	}
				610
				611	void free_pgtable_page(void *vaddr)
				612	{
				613	free_page((unsigned long)vaddr);
				614	}
				615
				616	static inline void *alloc_domain_mem(void)
				617	{
				618	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
				619	}
				620
				621	static void free_domain_mem(void *vaddr)
				622	{
				623	kmem_cache_free(iommu_domain_cache, vaddr);
				624	}
				625
				626	static inline void * alloc_devinfo_mem(void)
				627	{
				628	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
				629	}
				630
				631	static inline void free_devinfo_mem(void *vaddr)
				632	{
				633	kmem_cache_free(iommu_devinfo_cache, vaddr);
				634	}
				635
				636	static inline int domain_type_is_vm(struct dmar_domain *domain)
				637	{
				638	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
				639	}
				640
				641	static inline int domain_type_is_si(struct dmar_domain *domain)
				642	{
				643	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
				644	}
				645
				646	static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
				647	{
				648	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE \|
				649	DOMAIN_FLAG_STATIC_IDENTITY);
				650	}
				651
				652	static inline int domain_pfn_supported(struct dmar_domain *domain,
				653	unsigned long pfn)
				654	{
				655	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
				656
				657	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
				658	}
				659
				660	static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
				661	{
				662	unsigned long sagaw;
				663	int agaw = -1;
				664
				665	sagaw = cap_sagaw(iommu->cap);
				666	for (agaw = width_to_agaw(max_gaw);
				667	agaw >= 0; agaw--) {
				668	if (test_bit(agaw, &sagaw))
				669	break;
				670	}
				671
				672	return agaw;
				673	}
				674
				675	/*
				676	* Calculate max SAGAW for each iommu.
				677	*/
				678	int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
				679	{
				680	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
				681	}
				682
				683	/*
				684	* calculate agaw for each iommu.
				685	* "SAGAW" may be different across iommus, use a default agaw, and
				686	* get a supported less agaw for iommus that don't support the default agaw.
				687	*/
				688	int iommu_calculate_agaw(struct intel_iommu *iommu)
				689	{
				690	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
				691	}
				692
				693	/* This functionin only returns single iommu in a domain */
				694	struct intel_iommu domain_get_iommu(struct dmar_domain domain)
				695	{
				696	int iommu_id;
				697
				698	/* si_domain and vm domain should not get here. */
				699	BUG_ON(domain_type_is_vm_or_si(domain));
				700	for_each_domain_iommu(iommu_id, domain)
				701	break;
				702
				703	if (iommu_id < 0 \|\| iommu_id >= g_num_of_iommus)
				704	return NULL;
				705
				706	return g_iommus[iommu_id];
				707	}
				708
				709	static void domain_update_iommu_coherency(struct dmar_domain *domain)
				710	{
				711	struct dmar_drhd_unit *drhd;
				712	struct intel_iommu *iommu;
				713	bool found = false;
				714	int i;
				715
				716	domain->iommu_coherency = 1;
				717
				718	for_each_domain_iommu(i, domain) {
				719	found = true;
				720	if (!ecap_coherent(g_iommus[i]->ecap)) {
				721	domain->iommu_coherency = 0;
				722	break;
				723	}
				724	}
				725	if (found)
				726	return;
				727
				728	/* No hardware attached; use lowest common denominator */
				729	rcu_read_lock();
				730	for_each_active_iommu(iommu, drhd) {
				731	if (!ecap_coherent(iommu->ecap)) {
				732	domain->iommu_coherency = 0;
				733	break;
				734	}
				735	}
				736	rcu_read_unlock();
				737	}
				738
				739	static int domain_update_iommu_snooping(struct intel_iommu *skip)
				740	{
				741	struct dmar_drhd_unit *drhd;
				742	struct intel_iommu *iommu;
				743	int ret = 1;
				744
				745	rcu_read_lock();
				746	for_each_active_iommu(iommu, drhd) {
				747	if (iommu != skip) {
				748	if (!ecap_sc_support(iommu->ecap)) {
				749	ret = 0;
				750	break;
				751	}
				752	}
				753	}
				754	rcu_read_unlock();
				755
				756	return ret;
				757	}
				758
				759	static int domain_update_iommu_superpage(struct intel_iommu *skip)
				760	{
				761	struct dmar_drhd_unit *drhd;
				762	struct intel_iommu *iommu;
				763	int mask = 0xf;
				764
				765	if (!intel_iommu_superpage) {
				766	return 0;
				767	}
				768
				769	/* set iommu_superpage to the smallest common denominator */
				770	rcu_read_lock();
				771	for_each_active_iommu(iommu, drhd) {
				772	if (iommu != skip) {
				773	mask &= cap_super_page_val(iommu->cap);
				774	if (!mask)
				775	break;
				776	}
				777	}
				778	rcu_read_unlock();
				779
				780	return fls(mask);
				781	}
				782
				783	/* Some capabilities may be different across iommus */
				784	static void domain_update_iommu_cap(struct dmar_domain *domain)
				785	{
				786	domain_update_iommu_coherency(domain);
				787	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
				788	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
				789	}
				790
				791	static inline struct context_entry iommu_context_addr(struct intel_iommu iommu,
				792	u8 bus, u8 devfn, int alloc)
				793	{
				794	struct root_entry *root = &iommu->root_entry[bus];
				795	struct context_entry *context;
				796	u64 *entry;
				797
				798	entry = &root->lo;
				799	if (ecs_enabled(iommu)) {
				800	if (devfn >= 0x80) {
				801	devfn -= 0x80;
				802	entry = &root->hi;
				803	}
				804	devfn *= 2;
				805	}
				806	if (*entry & 1)
				807	context = phys_to_virt(*entry & VTD_PAGE_MASK);
				808	else {
				809	unsigned long phy_addr;
				810	if (!alloc)
				811	return NULL;
				812
				813	context = alloc_pgtable_page(iommu->node);
				814	if (!context)
				815	return NULL;
				816
				817	__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
				818	phy_addr = virt_to_phys((void *)context);
				819	*entry = phy_addr \| 1;
				820	__iommu_flush_cache(iommu, entry, sizeof(*entry));
				821	}
				822	return &context[devfn];
				823	}
				824
				825	static int iommu_dummy(struct device *dev)
				826	{
				827	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
				828	}
				829
				830	static struct intel_iommu device_to_iommu(struct device dev, u8 bus, u8 devfn)
				831	{
				832	struct dmar_drhd_unit *drhd = NULL;
				833	struct intel_iommu *iommu;
				834	struct device *tmp;
				835	struct pci_dev ptmp, pdev = NULL;
				836	u16 segment = 0;
				837	int i;
				838
				839	if (iommu_dummy(dev))
				840	return NULL;
				841
				842	if (dev_is_pci(dev)) {
				843	struct pci_dev *pf_pdev;
				844
				845	pdev = to_pci_dev(dev);
				846
				847	#ifdef CONFIG_X86
				848	/* VMD child devices currently cannot be handled individually */
				849	if (is_vmd(pdev->bus))
				850	return NULL;
				851	#endif
				852
				853	/* VFs aren't listed in scope tables; we need to look up
				854	* the PF instead to find the IOMMU. */
				855	pf_pdev = pci_physfn(pdev);
				856	dev = &pf_pdev->dev;
				857	segment = pci_domain_nr(pdev->bus);
				858	} else if (has_acpi_companion(dev))
				859	dev = &ACPI_COMPANION(dev)->dev;
				860
				861	rcu_read_lock();
				862	for_each_active_iommu(iommu, drhd) {
				863	if (pdev && segment != drhd->segment)
				864	continue;
				865
				866	for_each_active_dev_scope(drhd->devices,
				867	drhd->devices_cnt, i, tmp) {
				868	if (tmp == dev) {
				869	/* For a VF use its original BDF# not that of the PF
				870	* which we used for the IOMMU lookup. Strictly speaking
				871	* we could do this for all PCI devices; we only need to
				872	* get the BDF# from the scope table for ACPI matches. */
				873	if (pdev && pdev->is_virtfn)
				874	goto got_pdev;
				875
				876	*bus = drhd->devices[i].bus;
				877	*devfn = drhd->devices[i].devfn;
				878	goto out;
				879	}
				880
				881	if (!pdev \|\| !dev_is_pci(tmp))
				882	continue;
				883
				884	ptmp = to_pci_dev(tmp);
				885	if (ptmp->subordinate &&
				886	ptmp->subordinate->number <= pdev->bus->number &&
				887	ptmp->subordinate->busn_res.end >= pdev->bus->number)
				888	goto got_pdev;
				889	}
				890
				891	if (pdev && drhd->include_all) {
				892	got_pdev:
				893	*bus = pdev->bus->number;
				894	*devfn = pdev->devfn;
				895	goto out;
				896	}
				897	}
				898	iommu = NULL;
				899	out:
				900	rcu_read_unlock();
				901
				902	return iommu;
				903	}
				904
				905	static void domain_flush_cache(struct dmar_domain *domain,
				906	void *addr, int size)
				907	{
				908	if (!domain->iommu_coherency)
				909	clflush_cache_range(addr, size);
				910	}
				911
				912	static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
				913	{
				914	struct context_entry *context;
				915	int ret = 0;
				916	unsigned long flags;
				917
				918	spin_lock_irqsave(&iommu->lock, flags);
				919	context = iommu_context_addr(iommu, bus, devfn, 0);
				920	if (context)
				921	ret = context_present(context);
				922	spin_unlock_irqrestore(&iommu->lock, flags);
				923	return ret;
				924	}
				925
				926	static void free_context_table(struct intel_iommu *iommu)
				927	{
				928	int i;
				929	unsigned long flags;
				930	struct context_entry *context;
				931
				932	spin_lock_irqsave(&iommu->lock, flags);
				933	if (!iommu->root_entry) {
				934	goto out;
				935	}
				936	for (i = 0; i < ROOT_ENTRY_NR; i++) {
				937	context = iommu_context_addr(iommu, i, 0, 0);
				938	if (context)
				939	free_pgtable_page(context);
				940
				941	if (!ecs_enabled(iommu))
				942	continue;
				943
				944	context = iommu_context_addr(iommu, i, 0x80, 0);
				945	if (context)
				946	free_pgtable_page(context);
				947
				948	}
				949	free_pgtable_page(iommu->root_entry);
				950	iommu->root_entry = NULL;
				951	out:
				952	spin_unlock_irqrestore(&iommu->lock, flags);
				953	}
				954
				955	static struct dma_pte pfn_to_dma_pte(struct dmar_domain domain,
				956	unsigned long pfn, int *target_level)
				957	{
				958	struct dma_pte parent, pte = NULL;
				959	int level = agaw_to_level(domain->agaw);
				960	int offset;
				961
				962	BUG_ON(!domain->pgd);
				963
				964	if (!domain_pfn_supported(domain, pfn))
				965	/* Address beyond IOMMU's addressing capabilities. */
				966	return NULL;
				967
				968	parent = domain->pgd;
				969
				970	while (1) {
				971	void *tmp_page;
				972
				973	offset = pfn_level_offset(pfn, level);
				974	pte = &parent[offset];
				975	if (!*target_level && (dma_pte_superpage(pte) \|\| !dma_pte_present(pte)))
				976	break;
				977	if (level == *target_level)
				978	break;
				979
				980	if (!dma_pte_present(pte)) {
				981	uint64_t pteval;
				982
				983	tmp_page = alloc_pgtable_page(domain->nid);
				984
				985	if (!tmp_page)
				986	return NULL;
				987
				988	domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
				989	pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) \| DMA_PTE_READ \| DMA_PTE_WRITE;
				990	if (cmpxchg64(&pte->val, 0ULL, pteval))
				991	/* Someone else set it while we were thinking; use theirs. */
				992	free_pgtable_page(tmp_page);
				993	else
				994	domain_flush_cache(domain, pte, sizeof(*pte));
				995	}
				996	if (level == 1)
				997	break;
				998
				999	parent = phys_to_virt(dma_pte_addr(pte));
				1000	level--;
				1001	}
				1002
				1003	if (!*target_level)
				1004	*target_level = level;
				1005
				1006	return pte;
				1007	}
				1008
				1009
				1010	/* return address's pte at specific level */
				1011	static struct dma_pte dma_pfn_level_pte(struct dmar_domain domain,
				1012	unsigned long pfn,
				1013	int level, int *large_page)
				1014	{
				1015	struct dma_pte parent, pte = NULL;
				1016	int total = agaw_to_level(domain->agaw);
				1017	int offset;
				1018
				1019	parent = domain->pgd;
				1020	while (level <= total) {
				1021	offset = pfn_level_offset(pfn, total);
				1022	pte = &parent[offset];
				1023	if (level == total)
				1024	return pte;
				1025
				1026	if (!dma_pte_present(pte)) {
				1027	*large_page = total;
				1028	break;
				1029	}
				1030
				1031	if (dma_pte_superpage(pte)) {
				1032	*large_page = total;
				1033	return pte;
				1034	}
				1035
				1036	parent = phys_to_virt(dma_pte_addr(pte));
				1037	total--;
				1038	}
				1039	return NULL;
				1040	}
				1041
				1042	/* clear last level pte, a tlb flush should be followed */
				1043	static void dma_pte_clear_range(struct dmar_domain *domain,
				1044	unsigned long start_pfn,
				1045	unsigned long last_pfn)
				1046	{
				1047	unsigned int large_page = 1;
				1048	struct dma_pte first_pte, pte;
				1049
				1050	BUG_ON(!domain_pfn_supported(domain, start_pfn));
				1051	BUG_ON(!domain_pfn_supported(domain, last_pfn));
				1052	BUG_ON(start_pfn > last_pfn);
				1053
				1054	/* we don't need lock here; nobody else touches the iova range */
				1055	do {
				1056	large_page = 1;
				1057	first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
				1058	if (!pte) {
				1059	start_pfn = align_to_level(start_pfn + 1, large_page + 1);
				1060	continue;
				1061	}
				1062	do {
				1063	dma_clear_pte(pte);
				1064	start_pfn += lvl_to_nr_pages(large_page);
				1065	pte++;
				1066	} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
				1067
				1068	domain_flush_cache(domain, first_pte,
				1069	(void )pte - (void )first_pte);
				1070
				1071	} while (start_pfn && start_pfn <= last_pfn);
				1072	}
				1073
				1074	static void dma_pte_free_level(struct dmar_domain *domain, int level,
				1075	int retain_level, struct dma_pte *pte,
				1076	unsigned long pfn, unsigned long start_pfn,
				1077	unsigned long last_pfn)
				1078	{
				1079	pfn = max(start_pfn, pfn);
				1080	pte = &pte[pfn_level_offset(pfn, level)];
				1081
				1082	do {
				1083	unsigned long level_pfn;
				1084	struct dma_pte *level_pte;
				1085
				1086	if (!dma_pte_present(pte) \|\| dma_pte_superpage(pte))
				1087	goto next;
				1088
				1089	level_pfn = pfn & level_mask(level);
				1090	level_pte = phys_to_virt(dma_pte_addr(pte));
				1091
				1092	if (level > 2) {
				1093	dma_pte_free_level(domain, level - 1, retain_level,
				1094	level_pte, level_pfn, start_pfn,
				1095	last_pfn);
				1096	}
				1097
				1098	/*
				1099	* Free the page table if we're below the level we want to
				1100	* retain and the range covers the entire table.
				1101	*/
				1102	if (level < retain_level && !(start_pfn > level_pfn \|\|
				1103	last_pfn < level_pfn + level_size(level) - 1)) {
				1104	dma_clear_pte(pte);
				1105	domain_flush_cache(domain, pte, sizeof(*pte));
				1106	free_pgtable_page(level_pte);
				1107	}
				1108	next:
				1109	pfn += level_size(level);
				1110	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
				1111	}
				1112
				1113	/*
				1114	* clear last level (leaf) ptes and free page table pages below the
				1115	* level we wish to keep intact.
				1116	*/
				1117	static void dma_pte_free_pagetable(struct dmar_domain *domain,
				1118	unsigned long start_pfn,
				1119	unsigned long last_pfn,
				1120	int retain_level)
				1121	{
				1122	BUG_ON(!domain_pfn_supported(domain, start_pfn));
				1123	BUG_ON(!domain_pfn_supported(domain, last_pfn));
				1124	BUG_ON(start_pfn > last_pfn);
				1125
				1126	dma_pte_clear_range(domain, start_pfn, last_pfn);
				1127
				1128	/* We don't need lock here; nobody else touches the iova range */
				1129	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
				1130	domain->pgd, 0, start_pfn, last_pfn);
				1131
				1132	/* free pgd */
				1133	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
				1134	free_pgtable_page(domain->pgd);
				1135	domain->pgd = NULL;
				1136	}
				1137	}
				1138
				1139	/* When a page at a given level is being unlinked from its parent, we don't
				1140	need to modify it at all. All we need to do is make a list of all the
				1141	pages which can be freed just as soon as we've flushed the IOTLB and we
				1142	know the hardware page-walk will no longer touch them.
				1143	The 'pte' argument is the parent PTE, pointing to the page that is to
				1144	be freed. */
				1145	static struct page dma_pte_list_pagetables(struct dmar_domain domain,
				1146	int level, struct dma_pte *pte,
				1147	struct page *freelist)
				1148	{
				1149	struct page *pg;
				1150
				1151	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
				1152	pg->freelist = freelist;
				1153	freelist = pg;
				1154
				1155	if (level == 1)
				1156	return freelist;
				1157
				1158	pte = page_address(pg);
				1159	do {
				1160	if (dma_pte_present(pte) && !dma_pte_superpage(pte))
				1161	freelist = dma_pte_list_pagetables(domain, level - 1,
				1162	pte, freelist);
				1163	pte++;
				1164	} while (!first_pte_in_page(pte));
				1165
				1166	return freelist;
				1167	}
				1168
				1169	static struct page dma_pte_clear_level(struct dmar_domain domain, int level,
				1170	struct dma_pte *pte, unsigned long pfn,
				1171	unsigned long start_pfn,
				1172	unsigned long last_pfn,
				1173	struct page *freelist)
				1174	{
				1175	struct dma_pte first_pte = NULL, last_pte = NULL;
				1176
				1177	pfn = max(start_pfn, pfn);
				1178	pte = &pte[pfn_level_offset(pfn, level)];
				1179
				1180	do {
				1181	unsigned long level_pfn;
				1182
				1183	if (!dma_pte_present(pte))
				1184	goto next;
				1185
				1186	level_pfn = pfn & level_mask(level);
				1187
				1188	/* If range covers entire pagetable, free it */
				1189	if (start_pfn <= level_pfn &&
				1190	last_pfn >= level_pfn + level_size(level) - 1) {
				1191	/* These suborbinate page tables are going away entirely. Don't
				1192	bother to clear them; we're just going to free them. */
				1193	if (level > 1 && !dma_pte_superpage(pte))
				1194	freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
				1195
				1196	dma_clear_pte(pte);
				1197	if (!first_pte)
				1198	first_pte = pte;
				1199	last_pte = pte;
				1200	} else if (level > 1) {
				1201	/* Recurse down into a level that isn't entirely obsolete */
				1202	freelist = dma_pte_clear_level(domain, level - 1,
				1203	phys_to_virt(dma_pte_addr(pte)),
				1204	level_pfn, start_pfn, last_pfn,
				1205	freelist);
				1206	}
				1207	next:
				1208	pfn += level_size(level);
				1209	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
				1210
				1211	if (first_pte)
				1212	domain_flush_cache(domain, first_pte,
				1213	(void )++last_pte - (void )first_pte);
				1214
				1215	return freelist;
				1216	}
				1217
				1218	/* We can't just free the pages because the IOMMU may still be walking
				1219	the page tables, and may have cached the intermediate levels. The
				1220	pages can only be freed after the IOTLB flush has been done. */
				1221	static struct page domain_unmap(struct dmar_domain domain,
				1222	unsigned long start_pfn,
				1223	unsigned long last_pfn)
				1224	{
				1225	struct page *freelist = NULL;
				1226
				1227	BUG_ON(!domain_pfn_supported(domain, start_pfn));
				1228	BUG_ON(!domain_pfn_supported(domain, last_pfn));
				1229	BUG_ON(start_pfn > last_pfn);
				1230
				1231	/* we don't need lock here; nobody else touches the iova range */
				1232	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				1233	domain->pgd, 0, start_pfn, last_pfn, NULL);
				1234
				1235	/* free pgd */
				1236	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
				1237	struct page *pgd_page = virt_to_page(domain->pgd);
				1238	pgd_page->freelist = freelist;
				1239	freelist = pgd_page;
				1240
				1241	domain->pgd = NULL;
				1242	}
				1243
				1244	return freelist;
				1245	}
				1246
				1247	static void dma_free_pagelist(struct page *freelist)
				1248	{
				1249	struct page *pg;
				1250
				1251	while ((pg = freelist)) {
				1252	freelist = pg->freelist;
				1253	free_pgtable_page(page_address(pg));
				1254	}
				1255	}
				1256
				1257	static void iova_entry_free(unsigned long data)
				1258	{
				1259	struct page freelist = (struct page )data;
				1260
				1261	dma_free_pagelist(freelist);
				1262	}
				1263
				1264	/* iommu handling */
				1265	static int iommu_alloc_root_entry(struct intel_iommu *iommu)
				1266	{
				1267	struct root_entry *root;
				1268	unsigned long flags;
				1269
				1270	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
				1271	if (!root) {
				1272	pr_err("Allocating root entry for %s failed\n",
				1273	iommu->name);
				1274	return -ENOMEM;
				1275	}
				1276
				1277	__iommu_flush_cache(iommu, root, ROOT_SIZE);
				1278
				1279	spin_lock_irqsave(&iommu->lock, flags);
				1280	iommu->root_entry = root;
				1281	spin_unlock_irqrestore(&iommu->lock, flags);
				1282
				1283	return 0;
				1284	}
				1285
				1286	static void iommu_set_root_entry(struct intel_iommu *iommu)
				1287	{
				1288	u64 addr;
				1289	u32 sts;
				1290	unsigned long flag;
				1291
				1292	addr = virt_to_phys(iommu->root_entry);
				1293	if (ecs_enabled(iommu))
				1294	addr \|= DMA_RTADDR_RTT;
				1295
				1296	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				1297	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
				1298
				1299	writel(iommu->gcmd \| DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
				1300
				1301	/* Make sure hardware complete it */
				1302	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
				1303	readl, (sts & DMA_GSTS_RTPS), sts);
				1304
				1305	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				1306	}
				1307
				1308	static void iommu_flush_write_buffer(struct intel_iommu *iommu)
				1309	{
				1310	u32 val;
				1311	unsigned long flag;
				1312
				1313	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
				1314	return;
				1315
				1316	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				1317	writel(iommu->gcmd \| DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
				1318
				1319	/* Make sure hardware complete it */
				1320	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
				1321	readl, (!(val & DMA_GSTS_WBFS)), val);
				1322
				1323	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				1324	}
				1325
				1326	/* return value determine if we need a write buffer flush */
				1327	static void __iommu_flush_context(struct intel_iommu *iommu,
				1328	u16 did, u16 source_id, u8 function_mask,
				1329	u64 type)
				1330	{
				1331	u64 val = 0;
				1332	unsigned long flag;
				1333
				1334	switch (type) {
				1335	case DMA_CCMD_GLOBAL_INVL:
				1336	val = DMA_CCMD_GLOBAL_INVL;
				1337	break;
				1338	case DMA_CCMD_DOMAIN_INVL:
				1339	val = DMA_CCMD_DOMAIN_INVL\|DMA_CCMD_DID(did);
				1340	break;
				1341	case DMA_CCMD_DEVICE_INVL:
				1342	val = DMA_CCMD_DEVICE_INVL\|DMA_CCMD_DID(did)
				1343	\| DMA_CCMD_SID(source_id) \| DMA_CCMD_FM(function_mask);
				1344	break;
				1345	default:
				1346	BUG();
				1347	}
				1348	val \|= DMA_CCMD_ICC;
				1349
				1350	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				1351	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
				1352
				1353	/* Make sure hardware complete it */
				1354	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
				1355	dmar_readq, (!(val & DMA_CCMD_ICC)), val);
				1356
				1357	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				1358	}
				1359
				1360	/* return value determine if we need a write buffer flush */
				1361	static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				1362	u64 addr, unsigned int size_order, u64 type)
				1363	{
				1364	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
				1365	u64 val = 0, val_iva = 0;
				1366	unsigned long flag;
				1367
				1368	switch (type) {
				1369	case DMA_TLB_GLOBAL_FLUSH:
				1370	/* global flush doesn't need set IVA_REG */
				1371	val = DMA_TLB_GLOBAL_FLUSH\|DMA_TLB_IVT;
				1372	break;
				1373	case DMA_TLB_DSI_FLUSH:
				1374	val = DMA_TLB_DSI_FLUSH\|DMA_TLB_IVT\|DMA_TLB_DID(did);
				1375	break;
				1376	case DMA_TLB_PSI_FLUSH:
				1377	val = DMA_TLB_PSI_FLUSH\|DMA_TLB_IVT\|DMA_TLB_DID(did);
				1378	/* IH bit is passed in as part of address */
				1379	val_iva = size_order \| addr;
				1380	break;
				1381	default:
				1382	BUG();
				1383	}
				1384	/* Note: set drain read/write */
				1385	#if 0
				1386	/*
				1387	* This is probably to be super secure.. Looks like we can
				1388	* ignore it without any impact.
				1389	*/
				1390	if (cap_read_drain(iommu->cap))
				1391	val \|= DMA_TLB_READ_DRAIN;
				1392	#endif
				1393	if (cap_write_drain(iommu->cap))
				1394	val \|= DMA_TLB_WRITE_DRAIN;
				1395
				1396	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				1397	/* Note: Only uses first TLB reg currently */
				1398	if (val_iva)
				1399	dmar_writeq(iommu->reg + tlb_offset, val_iva);
				1400	dmar_writeq(iommu->reg + tlb_offset + 8, val);
				1401
				1402	/* Make sure hardware complete it */
				1403	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
				1404	dmar_readq, (!(val & DMA_TLB_IVT)), val);
				1405
				1406	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				1407
				1408	/* check IOTLB invalidation granularity */
				1409	if (DMA_TLB_IAIG(val) == 0)
				1410	pr_err("Flush IOTLB failed\n");
				1411	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
				1412	pr_debug("TLB flush request %Lx, actual %Lx\n",
				1413	(unsigned long long)DMA_TLB_IIRG(type),
				1414	(unsigned long long)DMA_TLB_IAIG(val));
				1415	}
				1416
				1417	static struct device_domain_info *
				1418	iommu_support_dev_iotlb (struct dmar_domain domain, struct intel_iommu iommu,
				1419	u8 bus, u8 devfn)
				1420	{
				1421	struct device_domain_info *info;
				1422
				1423	assert_spin_locked(&device_domain_lock);
				1424
				1425	if (!iommu->qi)
				1426	return NULL;
				1427
				1428	list_for_each_entry(info, &domain->devices, link)
				1429	if (info->iommu == iommu && info->bus == bus &&
				1430	info->devfn == devfn) {
				1431	if (info->ats_supported && info->dev)
				1432	return info;
				1433	break;
				1434	}
				1435
				1436	return NULL;
				1437	}
				1438
				1439	static void domain_update_iotlb(struct dmar_domain *domain)
				1440	{
				1441	struct device_domain_info *info;
				1442	bool has_iotlb_device = false;
				1443
				1444	assert_spin_locked(&device_domain_lock);
				1445
				1446	list_for_each_entry(info, &domain->devices, link) {
				1447	struct pci_dev *pdev;
				1448
				1449	if (!info->dev \|\| !dev_is_pci(info->dev))
				1450	continue;
				1451
				1452	pdev = to_pci_dev(info->dev);
				1453	if (pdev->ats_enabled) {
				1454	has_iotlb_device = true;
				1455	break;
				1456	}
				1457	}
				1458
				1459	domain->has_iotlb_device = has_iotlb_device;
				1460	}
				1461
				1462	static void iommu_enable_dev_iotlb(struct device_domain_info *info)
				1463	{
				1464	struct pci_dev *pdev;
				1465
				1466	assert_spin_locked(&device_domain_lock);
				1467
				1468	if (!info \|\| !dev_is_pci(info->dev))
				1469	return;
				1470
				1471	pdev = to_pci_dev(info->dev);
				1472	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
				1473	* PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
				1474	* queue depth at PF level. If DIT is not set, PFSID will be treated as
				1475	* reserved, which should be set to 0.
				1476	*/
				1477	if (!ecap_dit(info->iommu->ecap))
				1478	info->pfsid = 0;
				1479	else {
				1480	struct pci_dev *pf_pdev;
				1481
				1482	/* pdev will be returned if device is not a vf */
				1483	pf_pdev = pci_physfn(pdev);
				1484	info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
				1485	}
				1486
				1487	#ifdef CONFIG_INTEL_IOMMU_SVM
				1488	/* The PCIe spec, in its wisdom, declares that the behaviour of
				1489	the device if you enable PASID support after ATS support is
				1490	undefined. So always enable PASID support on devices which
				1491	have it, even if we can't yet know if we're ever going to
				1492	use it. */
				1493	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
				1494	info->pasid_enabled = 1;
				1495
				1496	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
				1497	info->pri_enabled = 1;
				1498	#endif
				1499	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
				1500	info->ats_enabled = 1;
				1501	domain_update_iotlb(info->domain);
				1502	info->ats_qdep = pci_ats_queue_depth(pdev);
				1503	}
				1504	}
				1505
				1506	static void iommu_disable_dev_iotlb(struct device_domain_info *info)
				1507	{
				1508	struct pci_dev *pdev;
				1509
				1510	assert_spin_locked(&device_domain_lock);
				1511
				1512	if (!dev_is_pci(info->dev))
				1513	return;
				1514
				1515	pdev = to_pci_dev(info->dev);
				1516
				1517	if (info->ats_enabled) {
				1518	pci_disable_ats(pdev);
				1519	info->ats_enabled = 0;
				1520	domain_update_iotlb(info->domain);
				1521	}
				1522	#ifdef CONFIG_INTEL_IOMMU_SVM
				1523	if (info->pri_enabled) {
				1524	pci_disable_pri(pdev);
				1525	info->pri_enabled = 0;
				1526	}
				1527	if (info->pasid_enabled) {
				1528	pci_disable_pasid(pdev);
				1529	info->pasid_enabled = 0;
				1530	}
				1531	#endif
				1532	}
				1533
				1534	static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				1535	u64 addr, unsigned mask)
				1536	{
				1537	u16 sid, qdep;
				1538	unsigned long flags;
				1539	struct device_domain_info *info;
				1540
				1541	if (!domain->has_iotlb_device)
				1542	return;
				1543
				1544	spin_lock_irqsave(&device_domain_lock, flags);
				1545	list_for_each_entry(info, &domain->devices, link) {
				1546	if (!info->ats_enabled)
				1547	continue;
				1548
				1549	sid = info->bus << 8 \| info->devfn;
				1550	qdep = info->ats_qdep;
				1551	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				1552	qdep, addr, mask);
				1553	}
				1554	spin_unlock_irqrestore(&device_domain_lock, flags);
				1555	}
				1556
				1557	static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				1558	struct dmar_domain *domain,
				1559	unsigned long pfn, unsigned int pages,
				1560	int ih, int map)
				1561	{
				1562	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
				1563	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
				1564	u16 did = domain->iommu_did[iommu->seq_id];
				1565
				1566	BUG_ON(pages == 0);
				1567
				1568	if (ih)
				1569	ih = 1 << 6;
				1570	/*
				1571	* Fallback to domain selective flush if no PSI support or the size is
				1572	* too big.
				1573	* PSI requires page size to be 2 ^ x, and the base address is naturally
				1574	* aligned to the size
				1575	*/
				1576	if (!cap_pgsel_inv(iommu->cap) \|\| mask > cap_max_amask_val(iommu->cap))
				1577	iommu->flush.flush_iotlb(iommu, did, 0, 0,
				1578	DMA_TLB_DSI_FLUSH);
				1579	else
				1580	iommu->flush.flush_iotlb(iommu, did, addr \| ih, mask,
				1581	DMA_TLB_PSI_FLUSH);
				1582
				1583	/*
				1584	* In caching mode, changes of pages from non-present to present require
				1585	* flush. However, device IOTLB doesn't need to be flushed in this case.
				1586	*/
				1587	if (!cap_caching_mode(iommu->cap) \|\| !map)
				1588	iommu_flush_dev_iotlb(domain, addr, mask);
				1589	}
				1590
				1591	/* Notification for newly created mappings */
				1592	static inline void __mapping_notify_one(struct intel_iommu *iommu,
				1593	struct dmar_domain *domain,
				1594	unsigned long pfn, unsigned int pages)
				1595	{
				1596	/* It's a non-present to present mapping. Only flush if caching mode */
				1597	if (cap_caching_mode(iommu->cap))
				1598	iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
				1599	else
				1600	iommu_flush_write_buffer(iommu);
				1601	}
				1602
				1603	static void iommu_flush_iova(struct iova_domain *iovad)
				1604	{
				1605	struct dmar_domain *domain;
				1606	int idx;
				1607
				1608	domain = container_of(iovad, struct dmar_domain, iovad);
				1609
				1610	for_each_domain_iommu(idx, domain) {
				1611	struct intel_iommu *iommu = g_iommus[idx];
				1612	u16 did = domain->iommu_did[iommu->seq_id];
				1613
				1614	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
				1615
				1616	if (!cap_caching_mode(iommu->cap))
				1617	iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
				1618	0, MAX_AGAW_PFN_WIDTH);
				1619	}
				1620	}
				1621
				1622	static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
				1623	{
				1624	u32 pmen;
				1625	unsigned long flags;
				1626
				1627	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
				1628	return;
				1629
				1630	raw_spin_lock_irqsave(&iommu->register_lock, flags);
				1631	pmen = readl(iommu->reg + DMAR_PMEN_REG);
				1632	pmen &= ~DMA_PMEN_EPM;
				1633	writel(pmen, iommu->reg + DMAR_PMEN_REG);
				1634
				1635	/* wait for the protected region status bit to clear */
				1636	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
				1637	readl, !(pmen & DMA_PMEN_PRS), pmen);
				1638
				1639	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
				1640	}
				1641
				1642	static void iommu_enable_translation(struct intel_iommu *iommu)
				1643	{
				1644	u32 sts;
				1645	unsigned long flags;
				1646
				1647	raw_spin_lock_irqsave(&iommu->register_lock, flags);
				1648	iommu->gcmd \|= DMA_GCMD_TE;
				1649	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
				1650
				1651	/* Make sure hardware complete it */
				1652	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
				1653	readl, (sts & DMA_GSTS_TES), sts);
				1654
				1655	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
				1656	}
				1657
				1658	static void iommu_disable_translation(struct intel_iommu *iommu)
				1659	{
				1660	u32 sts;
				1661	unsigned long flag;
				1662
				1663	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				1664	iommu->gcmd &= ~DMA_GCMD_TE;
				1665	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
				1666
				1667	/* Make sure hardware complete it */
				1668	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
				1669	readl, (!(sts & DMA_GSTS_TES)), sts);
				1670
				1671	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				1672	}
				1673
				1674
				1675	static int iommu_init_domains(struct intel_iommu *iommu)
				1676	{
				1677	u32 ndomains, nlongs;
				1678	size_t size;
				1679
				1680	ndomains = cap_ndoms(iommu->cap);
				1681	pr_debug("%s: Number of Domains supported <%d>\n",
				1682	iommu->name, ndomains);
				1683	nlongs = BITS_TO_LONGS(ndomains);
				1684
				1685	spin_lock_init(&iommu->lock);
				1686
				1687	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
				1688	if (!iommu->domain_ids) {
				1689	pr_err("%s: Allocating domain id array failed\n",
				1690	iommu->name);
				1691	return -ENOMEM;
				1692	}
				1693
				1694	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
				1695	iommu->domains = kzalloc(size, GFP_KERNEL);
				1696
				1697	if (iommu->domains) {
				1698	size = 256 * sizeof(struct dmar_domain *);
				1699	iommu->domains[0] = kzalloc(size, GFP_KERNEL);
				1700	}
				1701
				1702	if (!iommu->domains \|\| !iommu->domains[0]) {
				1703	pr_err("%s: Allocating domain array failed\n",
				1704	iommu->name);
				1705	kfree(iommu->domain_ids);
				1706	kfree(iommu->domains);
				1707	iommu->domain_ids = NULL;
				1708	iommu->domains = NULL;
				1709	return -ENOMEM;
				1710	}
				1711
				1712
				1713
				1714	/*
				1715	* If Caching mode is set, then invalid translations are tagged
				1716	* with domain-id 0, hence we need to pre-allocate it. We also
				1717	* use domain-id 0 as a marker for non-allocated domain-id, so
				1718	* make sure it is not used for a real domain.
				1719	*/
				1720	set_bit(0, iommu->domain_ids);
				1721
				1722	return 0;
				1723	}
				1724
				1725	static void disable_dmar_iommu(struct intel_iommu *iommu)
				1726	{
				1727	struct device_domain_info info, tmp;
				1728	unsigned long flags;
				1729
				1730	if (!iommu->domains \|\| !iommu->domain_ids)
				1731	return;
				1732
				1733	again:
				1734	spin_lock_irqsave(&device_domain_lock, flags);
				1735	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
				1736	struct dmar_domain *domain;
				1737
				1738	if (info->iommu != iommu)
				1739	continue;
				1740
				1741	if (!info->dev \|\| !info->domain)
				1742	continue;
				1743
				1744	domain = info->domain;
				1745
				1746	__dmar_remove_one_dev_info(info);
				1747
				1748	if (!domain_type_is_vm_or_si(domain)) {
				1749	/*
				1750	* The domain_exit() function can't be called under
				1751	* device_domain_lock, as it takes this lock itself.
				1752	* So release the lock here and re-run the loop
				1753	* afterwards.
				1754	*/
				1755	spin_unlock_irqrestore(&device_domain_lock, flags);
				1756	domain_exit(domain);
				1757	goto again;
				1758	}
				1759	}
				1760	spin_unlock_irqrestore(&device_domain_lock, flags);
				1761
				1762	if (iommu->gcmd & DMA_GCMD_TE)
				1763	iommu_disable_translation(iommu);
				1764	}
				1765
				1766	static void free_dmar_iommu(struct intel_iommu *iommu)
				1767	{
				1768	if ((iommu->domains) && (iommu->domain_ids)) {
				1769	int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
				1770	int i;
				1771
				1772	for (i = 0; i < elems; i++)
				1773	kfree(iommu->domains[i]);
				1774	kfree(iommu->domains);
				1775	kfree(iommu->domain_ids);
				1776	iommu->domains = NULL;
				1777	iommu->domain_ids = NULL;
				1778	}
				1779
				1780	g_iommus[iommu->seq_id] = NULL;
				1781
				1782	/* free context mapping */
				1783	free_context_table(iommu);
				1784
				1785	#ifdef CONFIG_INTEL_IOMMU_SVM
				1786	if (pasid_enabled(iommu)) {
				1787	if (ecap_prs(iommu->ecap))
				1788	intel_svm_finish_prq(iommu);
				1789	intel_svm_exit(iommu);
				1790	}
				1791	#endif
				1792	}
				1793
				1794	static struct dmar_domain *alloc_domain(int flags)
				1795	{
				1796	struct dmar_domain *domain;
				1797
				1798	domain = alloc_domain_mem();
				1799	if (!domain)
				1800	return NULL;
				1801
				1802	memset(domain, 0, sizeof(*domain));
				1803	domain->nid = -1;
				1804	domain->flags = flags;
				1805	domain->has_iotlb_device = false;
				1806	INIT_LIST_HEAD(&domain->devices);
				1807
				1808	return domain;
				1809	}
				1810
				1811	/* Must be called with iommu->lock */
				1812	static int domain_attach_iommu(struct dmar_domain *domain,
				1813	struct intel_iommu *iommu)
				1814	{
				1815	unsigned long ndomains;
				1816	int num;
				1817
				1818	assert_spin_locked(&device_domain_lock);
				1819	assert_spin_locked(&iommu->lock);
				1820
				1821	domain->iommu_refcnt[iommu->seq_id] += 1;
				1822	domain->iommu_count += 1;
				1823	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
				1824	ndomains = cap_ndoms(iommu->cap);
				1825	num = find_first_zero_bit(iommu->domain_ids, ndomains);
				1826
				1827	if (num >= ndomains) {
				1828	pr_err("%s: No free domain ids\n", iommu->name);
				1829	domain->iommu_refcnt[iommu->seq_id] -= 1;
				1830	domain->iommu_count -= 1;
				1831	return -ENOSPC;
				1832	}
				1833
				1834	set_bit(num, iommu->domain_ids);
				1835	set_iommu_domain(iommu, num, domain);
				1836
				1837	domain->iommu_did[iommu->seq_id] = num;
				1838	domain->nid = iommu->node;
				1839
				1840	domain_update_iommu_cap(domain);
				1841	}
				1842
				1843	return 0;
				1844	}
				1845
				1846	static int domain_detach_iommu(struct dmar_domain *domain,
				1847	struct intel_iommu *iommu)
				1848	{
				1849	int num, count = INT_MAX;
				1850
				1851	assert_spin_locked(&device_domain_lock);
				1852	assert_spin_locked(&iommu->lock);
				1853
				1854	domain->iommu_refcnt[iommu->seq_id] -= 1;
				1855	count = --domain->iommu_count;
				1856	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
				1857	num = domain->iommu_did[iommu->seq_id];
				1858	clear_bit(num, iommu->domain_ids);
				1859	set_iommu_domain(iommu, num, NULL);
				1860
				1861	domain_update_iommu_cap(domain);
				1862	domain->iommu_did[iommu->seq_id] = 0;
				1863	}
				1864
				1865	return count;
				1866	}
				1867
				1868	static struct iova_domain reserved_iova_list;
				1869	static struct lock_class_key reserved_rbtree_key;
				1870
				1871	static int dmar_init_reserved_ranges(void)
				1872	{
				1873	struct pci_dev *pdev = NULL;
				1874	struct iova *iova;
				1875	int i;
				1876
				1877	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
				1878
				1879	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
				1880	&reserved_rbtree_key);
				1881
				1882	/* IOAPIC ranges shouldn't be accessed by DMA */
				1883	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
				1884	IOVA_PFN(IOAPIC_RANGE_END));
				1885	if (!iova) {
				1886	pr_err("Reserve IOAPIC range failed\n");
				1887	return -ENODEV;
				1888	}
				1889
				1890	/* Reserve all PCI MMIO to avoid peer-to-peer access */
				1891	for_each_pci_dev(pdev) {
				1892	struct resource *r;
				1893
				1894	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
				1895	r = &pdev->resource[i];
				1896	if (!r->flags \|\| !(r->flags & IORESOURCE_MEM))
				1897	continue;
				1898	iova = reserve_iova(&reserved_iova_list,
				1899	IOVA_PFN(r->start),
				1900	IOVA_PFN(r->end));
				1901	if (!iova) {
				1902	pr_err("Reserve iova failed\n");
				1903	return -ENODEV;
				1904	}
				1905	}
				1906	}
				1907	return 0;
				1908	}
				1909
				1910	static void domain_reserve_special_ranges(struct dmar_domain *domain)
				1911	{
				1912	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
				1913	}
				1914
				1915	static inline int guestwidth_to_adjustwidth(int gaw)
				1916	{
				1917	int agaw;
				1918	int r = (gaw - 12) % 9;
				1919
				1920	if (r == 0)
				1921	agaw = gaw;
				1922	else
				1923	agaw = gaw + 9 - r;
				1924	if (agaw > 64)
				1925	agaw = 64;
				1926	return agaw;
				1927	}
				1928
				1929	static int domain_init(struct dmar_domain domain, struct intel_iommu iommu,
				1930	int guest_width)
				1931	{
				1932	int adjust_width, agaw;
				1933	unsigned long sagaw;
				1934	int err;
				1935
				1936	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
				1937
				1938	err = init_iova_flush_queue(&domain->iovad,
				1939	iommu_flush_iova, iova_entry_free);
				1940	if (err)
				1941	return err;
				1942
				1943	domain_reserve_special_ranges(domain);
				1944
				1945	/* calculate AGAW */
				1946	if (guest_width > cap_mgaw(iommu->cap))
				1947	guest_width = cap_mgaw(iommu->cap);
				1948	domain->gaw = guest_width;
				1949	adjust_width = guestwidth_to_adjustwidth(guest_width);
				1950	agaw = width_to_agaw(adjust_width);
				1951	sagaw = cap_sagaw(iommu->cap);
				1952	if (!test_bit(agaw, &sagaw)) {
				1953	/* hardware doesn't support it, choose a bigger one */
				1954	pr_debug("Hardware doesn't support agaw %d\n", agaw);
				1955	agaw = find_next_bit(&sagaw, 5, agaw);
				1956	if (agaw >= 5)
				1957	return -ENODEV;
				1958	}
				1959	domain->agaw = agaw;
				1960
				1961	if (ecap_coherent(iommu->ecap))
				1962	domain->iommu_coherency = 1;
				1963	else
				1964	domain->iommu_coherency = 0;
				1965
				1966	if (ecap_sc_support(iommu->ecap))
				1967	domain->iommu_snooping = 1;
				1968	else
				1969	domain->iommu_snooping = 0;
				1970
				1971	if (intel_iommu_superpage)
				1972	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
				1973	else
				1974	domain->iommu_superpage = 0;
				1975
				1976	domain->nid = iommu->node;
				1977
				1978	/* always allocate the top pgd */
				1979	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
				1980	if (!domain->pgd)
				1981	return -ENOMEM;
				1982	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
				1983	return 0;
				1984	}
				1985
				1986	static void domain_exit(struct dmar_domain *domain)
				1987	{
				1988	struct page *freelist = NULL;
				1989
				1990	/* Domain 0 is reserved, so dont process it */
				1991	if (!domain)
				1992	return;
				1993
				1994	/* Remove associated devices and clear attached or cached domains */
				1995	rcu_read_lock();
				1996	domain_remove_dev_info(domain);
				1997	rcu_read_unlock();
				1998
				1999	/* destroy iovas */
				2000	put_iova_domain(&domain->iovad);
				2001
				2002	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
				2003
				2004	dma_free_pagelist(freelist);
				2005
				2006	free_domain_mem(domain);
				2007	}
				2008
				2009	static int domain_context_mapping_one(struct dmar_domain *domain,
				2010	struct intel_iommu *iommu,
				2011	u8 bus, u8 devfn)
				2012	{
				2013	u16 did = domain->iommu_did[iommu->seq_id];
				2014	int translation = CONTEXT_TT_MULTI_LEVEL;
				2015	struct device_domain_info *info = NULL;
				2016	struct context_entry *context;
				2017	unsigned long flags;
				2018	struct dma_pte *pgd;
				2019	int ret, agaw;
				2020
				2021	WARN_ON(did == 0);
				2022
				2023	if (hw_pass_through && domain_type_is_si(domain))
				2024	translation = CONTEXT_TT_PASS_THROUGH;
				2025
				2026	pr_debug("Set context mapping for %02x:%02x.%d\n",
				2027	bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
				2028
				2029	BUG_ON(!domain->pgd);
				2030
				2031	spin_lock_irqsave(&device_domain_lock, flags);
				2032	spin_lock(&iommu->lock);
				2033
				2034	ret = -ENOMEM;
				2035	context = iommu_context_addr(iommu, bus, devfn, 1);
				2036	if (!context)
				2037	goto out_unlock;
				2038
				2039	ret = 0;
				2040	if (context_present(context))
				2041	goto out_unlock;
				2042
				2043	/*
				2044	* For kdump cases, old valid entries may be cached due to the
				2045	* in-flight DMA and copied pgtable, but there is no unmapping
				2046	* behaviour for them, thus we need an explicit cache flush for
				2047	* the newly-mapped device. For kdump, at this point, the device
				2048	* is supposed to finish reset at its driver probe stage, so no
				2049	* in-flight DMA will exist, and we don't need to worry anymore
				2050	* hereafter.
				2051	*/
				2052	if (context_copied(context)) {
				2053	u16 did_old = context_domain_id(context);
				2054
				2055	if (did_old < cap_ndoms(iommu->cap)) {
				2056	iommu->flush.flush_context(iommu, did_old,
				2057	(((u16)bus) << 8) \| devfn,
				2058	DMA_CCMD_MASK_NOBIT,
				2059	DMA_CCMD_DEVICE_INVL);
				2060	iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
				2061	DMA_TLB_DSI_FLUSH);
				2062	}
				2063	}
				2064
				2065	pgd = domain->pgd;
				2066
				2067	context_clear_entry(context);
				2068	context_set_domain_id(context, did);
				2069
				2070	/*
				2071	* Skip top levels of page tables for iommu which has less agaw
				2072	* than default. Unnecessary for PT mode.
				2073	*/
				2074	if (translation != CONTEXT_TT_PASS_THROUGH) {
				2075	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				2076	ret = -ENOMEM;
				2077	pgd = phys_to_virt(dma_pte_addr(pgd));
				2078	if (!dma_pte_present(pgd))
				2079	goto out_unlock;
				2080	}
				2081
				2082	info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
				2083	if (info && info->ats_supported)
				2084	translation = CONTEXT_TT_DEV_IOTLB;
				2085	else
				2086	translation = CONTEXT_TT_MULTI_LEVEL;
				2087
				2088	context_set_address_root(context, virt_to_phys(pgd));
				2089	context_set_address_width(context, agaw);
				2090	} else {
				2091	/*
				2092	* In pass through mode, AW must be programmed to
				2093	* indicate the largest AGAW value supported by
				2094	* hardware. And ASR is ignored by hardware.
				2095	*/
				2096	context_set_address_width(context, iommu->msagaw);
				2097	}
				2098
				2099	context_set_translation_type(context, translation);
				2100	context_set_fault_enable(context);
				2101	context_set_present(context);
				2102	domain_flush_cache(domain, context, sizeof(*context));
				2103
				2104	/*
				2105	* It's a non-present to present mapping. If hardware doesn't cache
				2106	* non-present entry we only need to flush the write-buffer. If the
				2107	* _does_ cache non-present entries, then it does so in the special
				2108	* domain #0, which we have to flush:
				2109	*/
				2110	if (cap_caching_mode(iommu->cap)) {
				2111	iommu->flush.flush_context(iommu, 0,
				2112	(((u16)bus) << 8) \| devfn,
				2113	DMA_CCMD_MASK_NOBIT,
				2114	DMA_CCMD_DEVICE_INVL);
				2115	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
				2116	} else {
				2117	iommu_flush_write_buffer(iommu);
				2118	}
				2119	iommu_enable_dev_iotlb(info);
				2120
				2121	ret = 0;
				2122
				2123	out_unlock:
				2124	spin_unlock(&iommu->lock);
				2125	spin_unlock_irqrestore(&device_domain_lock, flags);
				2126
				2127	return ret;
				2128	}
				2129
				2130	struct domain_context_mapping_data {
				2131	struct dmar_domain *domain;
				2132	struct intel_iommu *iommu;
				2133	};
				2134
				2135	static int domain_context_mapping_cb(struct pci_dev *pdev,
				2136	u16 alias, void *opaque)
				2137	{
				2138	struct domain_context_mapping_data *data = opaque;
				2139
				2140	return domain_context_mapping_one(data->domain, data->iommu,
				2141	PCI_BUS_NUM(alias), alias & 0xff);
				2142	}
				2143
				2144	static int
				2145	domain_context_mapping(struct dmar_domain domain, struct device dev)
				2146	{
				2147	struct intel_iommu *iommu;
				2148	u8 bus, devfn;
				2149	struct domain_context_mapping_data data;
				2150
				2151	iommu = device_to_iommu(dev, &bus, &devfn);
				2152	if (!iommu)
				2153	return -ENODEV;
				2154
				2155	if (!dev_is_pci(dev))
				2156	return domain_context_mapping_one(domain, iommu, bus, devfn);
				2157
				2158	data.domain = domain;
				2159	data.iommu = iommu;
				2160
				2161	return pci_for_each_dma_alias(to_pci_dev(dev),
				2162	&domain_context_mapping_cb, &data);
				2163	}
				2164
				2165	static int domain_context_mapped_cb(struct pci_dev *pdev,
				2166	u16 alias, void *opaque)
				2167	{
				2168	struct intel_iommu *iommu = opaque;
				2169
				2170	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
				2171	}
				2172
				2173	static int domain_context_mapped(struct device *dev)
				2174	{
				2175	struct intel_iommu *iommu;
				2176	u8 bus, devfn;
				2177
				2178	iommu = device_to_iommu(dev, &bus, &devfn);
				2179	if (!iommu)
				2180	return -ENODEV;
				2181
				2182	if (!dev_is_pci(dev))
				2183	return device_context_mapped(iommu, bus, devfn);
				2184
				2185	return !pci_for_each_dma_alias(to_pci_dev(dev),
				2186	domain_context_mapped_cb, iommu);
				2187	}
				2188
				2189	/* Returns a number of VTD pages, but aligned to MM page size */
				2190	static inline unsigned long aligned_nrpages(unsigned long host_addr,
				2191	size_t size)
				2192	{
				2193	host_addr &= ~PAGE_MASK;
				2194	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
				2195	}
				2196
				2197	/* Return largest possible superpage level for a given mapping */
				2198	static inline int hardware_largepage_caps(struct dmar_domain *domain,
				2199	unsigned long iov_pfn,
				2200	unsigned long phy_pfn,
				2201	unsigned long pages)
				2202	{
				2203	int support, level = 1;
				2204	unsigned long pfnmerge;
				2205
				2206	support = domain->iommu_superpage;
				2207
				2208	/* To use a large page, the virtual and physical addresses
				2209	must be aligned to 2MiB/1GiB/etc. Lower bits set in either
				2210	of them will mean we have to use smaller pages. So just
				2211	merge them and check both at once. */
				2212	pfnmerge = iov_pfn \| phy_pfn;
				2213
				2214	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
				2215	pages >>= VTD_STRIDE_SHIFT;
				2216	if (!pages)
				2217	break;
				2218	pfnmerge >>= VTD_STRIDE_SHIFT;
				2219	level++;
				2220	support--;
				2221	}
				2222	return level;
				2223	}
				2224
				2225	static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				2226	struct scatterlist *sg, unsigned long phys_pfn,
				2227	unsigned long nr_pages, int prot)
				2228	{
				2229	struct dma_pte first_pte = NULL, pte = NULL;
				2230	phys_addr_t uninitialized_var(pteval);
				2231	unsigned long sg_res = 0;
				2232	unsigned int largepage_lvl = 0;
				2233	unsigned long lvl_pages = 0;
				2234
				2235	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
				2236
				2237	if ((prot & (DMA_PTE_READ\|DMA_PTE_WRITE)) == 0)
				2238	return -EINVAL;
				2239
				2240	prot &= DMA_PTE_READ \| DMA_PTE_WRITE \| DMA_PTE_SNP;
				2241
				2242	if (!sg) {
				2243	sg_res = nr_pages;
				2244	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) \| prot;
				2245	}
				2246
				2247	while (nr_pages > 0) {
				2248	uint64_t tmp;
				2249
				2250	if (!sg_res) {
				2251	unsigned int pgoff = sg->offset & ~PAGE_MASK;
				2252
				2253	sg_res = aligned_nrpages(sg->offset, sg->length);
				2254	sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
				2255	sg->dma_length = sg->length;
				2256	pteval = (sg_phys(sg) - pgoff) \| prot;
				2257	phys_pfn = pteval >> VTD_PAGE_SHIFT;
				2258	}
				2259
				2260	if (!pte) {
				2261	largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
				2262
				2263	first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
				2264	if (!pte)
				2265	return -ENOMEM;
				2266	/* It is large page*/
				2267	if (largepage_lvl > 1) {
				2268	unsigned long nr_superpages, end_pfn;
				2269
				2270	pteval \|= DMA_PTE_LARGE_PAGE;
				2271	lvl_pages = lvl_to_nr_pages(largepage_lvl);
				2272
				2273	nr_superpages = sg_res / lvl_pages;
				2274	end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
				2275
				2276	/*
				2277	* Ensure that old small page tables are
				2278	* removed to make room for superpage(s).
				2279	* We're adding new large pages, so make sure
				2280	* we don't remove their parent tables.
				2281	*/
				2282	dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
				2283	largepage_lvl + 1);
				2284	} else {
				2285	pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
				2286	}
				2287
				2288	}
				2289	/* We don't need lock here, nobody else
				2290	* touches the iova range
				2291	*/
				2292	tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
				2293	if (tmp) {
				2294	static int dumps = 5;
				2295	pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				2296	iov_pfn, tmp, (unsigned long long)pteval);
				2297	if (dumps) {
				2298	dumps--;
				2299	debug_dma_dump_mappings(NULL);
				2300	}
				2301	WARN_ON(1);
				2302	}
				2303
				2304	lvl_pages = lvl_to_nr_pages(largepage_lvl);
				2305
				2306	BUG_ON(nr_pages < lvl_pages);
				2307	BUG_ON(sg_res < lvl_pages);
				2308
				2309	nr_pages -= lvl_pages;
				2310	iov_pfn += lvl_pages;
				2311	phys_pfn += lvl_pages;
				2312	pteval += lvl_pages * VTD_PAGE_SIZE;
				2313	sg_res -= lvl_pages;
				2314
				2315	/* If the next PTE would be the first in a new page, then we
				2316	need to flush the cache on the entries we've just written.
				2317	And then we'll need to recalculate 'pte', so clear it and
				2318	let it get set again in the if (!pte) block above.
				2319
				2320	If we're done (!nr_pages) we need to flush the cache too.
				2321
				2322	Also if we've been setting superpages, we may need to
				2323	recalculate 'pte' and switch back to smaller pages for the
				2324	end of the mapping, if the trailing size is not enough to
				2325	use another superpage (i.e. sg_res < lvl_pages). */
				2326	pte++;
				2327	if (!nr_pages \|\| first_pte_in_page(pte) \|\|
				2328	(largepage_lvl > 1 && sg_res < lvl_pages)) {
				2329	domain_flush_cache(domain, first_pte,
				2330	(void )pte - (void )first_pte);
				2331	pte = NULL;
				2332	}
				2333
				2334	if (!sg_res && nr_pages)
				2335	sg = sg_next(sg);
				2336	}
				2337	return 0;
				2338	}
				2339
				2340	static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				2341	struct scatterlist *sg, unsigned long phys_pfn,
				2342	unsigned long nr_pages, int prot)
				2343	{
				2344	int ret;
				2345	struct intel_iommu *iommu;
				2346
				2347	/* Do the real mapping first */
				2348	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
				2349	if (ret)
				2350	return ret;
				2351
				2352	/* Notify about the new mapping */
				2353	if (domain_type_is_vm(domain)) {
				2354	/* VM typed domains can have more than one IOMMUs */
				2355	int iommu_id;
				2356	for_each_domain_iommu(iommu_id, domain) {
				2357	iommu = g_iommus[iommu_id];
				2358	__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
				2359	}
				2360	} else {
				2361	/* General domains only have one IOMMU */
				2362	iommu = domain_get_iommu(domain);
				2363	__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
				2364	}
				2365
				2366	return 0;
				2367	}
				2368
				2369	static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				2370	struct scatterlist *sg, unsigned long nr_pages,
				2371	int prot)
				2372	{
				2373	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
				2374	}
				2375
				2376	static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				2377	unsigned long phys_pfn, unsigned long nr_pages,
				2378	int prot)
				2379	{
				2380	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
				2381	}
				2382
				2383	static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
				2384	{
				2385	unsigned long flags;
				2386	struct context_entry *context;
				2387	u16 did_old;
				2388
				2389	if (!iommu)
				2390	return;
				2391
				2392	spin_lock_irqsave(&iommu->lock, flags);
				2393	context = iommu_context_addr(iommu, bus, devfn, 0);
				2394	if (!context) {
				2395	spin_unlock_irqrestore(&iommu->lock, flags);
				2396	return;
				2397	}
				2398	did_old = context_domain_id(context);
				2399	context_clear_entry(context);
				2400	__iommu_flush_cache(iommu, context, sizeof(*context));
				2401	spin_unlock_irqrestore(&iommu->lock, flags);
				2402	iommu->flush.flush_context(iommu,
				2403	did_old,
				2404	(((u16)bus) << 8) \| devfn,
				2405	DMA_CCMD_MASK_NOBIT,
				2406	DMA_CCMD_DEVICE_INVL);
				2407	iommu->flush.flush_iotlb(iommu,
				2408	did_old,
				2409	0,
				2410	0,
				2411	DMA_TLB_DSI_FLUSH);
				2412	}
				2413
				2414	static inline void unlink_domain_info(struct device_domain_info *info)
				2415	{
				2416	assert_spin_locked(&device_domain_lock);
				2417	list_del(&info->link);
				2418	list_del(&info->global);
				2419	if (info->dev)
				2420	info->dev->archdata.iommu = NULL;
				2421	}
				2422
				2423	static void domain_remove_dev_info(struct dmar_domain *domain)
				2424	{
				2425	struct device_domain_info info, tmp;
				2426	unsigned long flags;
				2427
				2428	spin_lock_irqsave(&device_domain_lock, flags);
				2429	list_for_each_entry_safe(info, tmp, &domain->devices, link)
				2430	__dmar_remove_one_dev_info(info);
				2431	spin_unlock_irqrestore(&device_domain_lock, flags);
				2432	}
				2433
				2434	/*
				2435	* find_domain
				2436	* Note: we use struct device->archdata.iommu stores the info
				2437	*/
				2438	static struct dmar_domain find_domain(struct device dev)
				2439	{
				2440	struct device_domain_info *info;
				2441
				2442	/* No lock here, assumes no domain exit in normal case */
				2443	info = dev->archdata.iommu;
				2444	if (likely(info))
				2445	return info->domain;
				2446	return NULL;
				2447	}
				2448
				2449	static inline struct device_domain_info *
				2450	dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
				2451	{
				2452	struct device_domain_info *info;
				2453
				2454	list_for_each_entry(info, &device_domain_list, global)
				2455	if (info->iommu->segment == segment && info->bus == bus &&
				2456	info->devfn == devfn)
				2457	return info;
				2458
				2459	return NULL;
				2460	}
				2461
				2462	static struct dmar_domain dmar_insert_one_dev_info(struct intel_iommu iommu,
				2463	int bus, int devfn,
				2464	struct device *dev,
				2465	struct dmar_domain *domain)
				2466	{
				2467	struct dmar_domain *found = NULL;
				2468	struct device_domain_info *info;
				2469	unsigned long flags;
				2470	int ret;
				2471
				2472	info = alloc_devinfo_mem();
				2473	if (!info)
				2474	return NULL;
				2475
				2476	info->bus = bus;
				2477	info->devfn = devfn;
				2478	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
				2479	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
				2480	info->ats_qdep = 0;
				2481	info->dev = dev;
				2482	info->domain = domain;
				2483	info->iommu = iommu;
				2484	info->pasid_table = NULL;
				2485
				2486	if (dev && dev_is_pci(dev)) {
				2487	struct pci_dev *pdev = to_pci_dev(info->dev);
				2488
				2489	if (!pci_ats_disabled() &&
				2490	ecap_dev_iotlb_support(iommu->ecap) &&
				2491	pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
				2492	dmar_find_matched_atsr_unit(pdev))
				2493	info->ats_supported = 1;
				2494
				2495	if (ecs_enabled(iommu)) {
				2496	if (pasid_enabled(iommu)) {
				2497	int features = pci_pasid_features(pdev);
				2498	if (features >= 0)
				2499	info->pasid_supported = features \| 1;
				2500	}
				2501
				2502	if (info->ats_supported && ecap_prs(iommu->ecap) &&
				2503	pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				2504	info->pri_supported = 1;
				2505	}
				2506	}
				2507
				2508	spin_lock_irqsave(&device_domain_lock, flags);
				2509	if (dev)
				2510	found = find_domain(dev);
				2511
				2512	if (!found) {
				2513	struct device_domain_info *info2;
				2514	info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
				2515	if (info2) {
				2516	found = info2->domain;
				2517	info2->dev = dev;
				2518	}
				2519	}
				2520
				2521	if (found) {
				2522	spin_unlock_irqrestore(&device_domain_lock, flags);
				2523	free_devinfo_mem(info);
				2524	/* Caller must free the original domain */
				2525	return found;
				2526	}
				2527
				2528	spin_lock(&iommu->lock);
				2529	ret = domain_attach_iommu(domain, iommu);
				2530	spin_unlock(&iommu->lock);
				2531
				2532	if (ret) {
				2533	spin_unlock_irqrestore(&device_domain_lock, flags);
				2534	free_devinfo_mem(info);
				2535	return NULL;
				2536	}
				2537
				2538	list_add(&info->link, &domain->devices);
				2539	list_add(&info->global, &device_domain_list);
				2540	if (dev)
				2541	dev->archdata.iommu = info;
				2542
				2543	if (dev && dev_is_pci(dev) && info->pasid_supported) {
				2544	ret = intel_pasid_alloc_table(dev);
				2545	if (ret) {
				2546	pr_warn("No pasid table for %s, pasid disabled\n",
				2547	dev_name(dev));
				2548	info->pasid_supported = 0;
				2549	}
				2550	}
				2551	spin_unlock_irqrestore(&device_domain_lock, flags);
				2552
				2553	if (dev && domain_context_mapping(domain, dev)) {
				2554	pr_err("Domain context map for %s failed\n", dev_name(dev));
				2555	dmar_remove_one_dev_info(domain, dev);
				2556	return NULL;
				2557	}
				2558
				2559	return domain;
				2560	}
				2561
				2562	static int get_last_alias(struct pci_dev pdev, u16 alias, void opaque)
				2563	{
				2564	(u16 )opaque = alias;
				2565	return 0;
				2566	}
				2567
				2568	static struct dmar_domain find_or_alloc_domain(struct device dev, int gaw)
				2569	{
				2570	struct device_domain_info *info = NULL;
				2571	struct dmar_domain *domain = NULL;
				2572	struct intel_iommu *iommu;
				2573	u16 dma_alias;
				2574	unsigned long flags;
				2575	u8 bus, devfn;
				2576
				2577	iommu = device_to_iommu(dev, &bus, &devfn);
				2578	if (!iommu)
				2579	return NULL;
				2580
				2581	if (dev_is_pci(dev)) {
				2582	struct pci_dev *pdev = to_pci_dev(dev);
				2583
				2584	pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
				2585
				2586	spin_lock_irqsave(&device_domain_lock, flags);
				2587	info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
				2588	PCI_BUS_NUM(dma_alias),
				2589	dma_alias & 0xff);
				2590	if (info) {
				2591	iommu = info->iommu;
				2592	domain = info->domain;
				2593	}
				2594	spin_unlock_irqrestore(&device_domain_lock, flags);
				2595
				2596	/* DMA alias already has a domain, use it */
				2597	if (info)
				2598	goto out;
				2599	}
				2600
				2601	/* Allocate and initialize new domain for the device */
				2602	domain = alloc_domain(0);
				2603	if (!domain)
				2604	return NULL;
				2605	if (domain_init(domain, iommu, gaw)) {
				2606	domain_exit(domain);
				2607	return NULL;
				2608	}
				2609
				2610	out:
				2611
				2612	return domain;
				2613	}
				2614
				2615	static struct dmar_domain set_domain_for_dev(struct device dev,
				2616	struct dmar_domain *domain)
				2617	{
				2618	struct intel_iommu *iommu;
				2619	struct dmar_domain *tmp;
				2620	u16 req_id, dma_alias;
				2621	u8 bus, devfn;
				2622
				2623	iommu = device_to_iommu(dev, &bus, &devfn);
				2624	if (!iommu)
				2625	return NULL;
				2626
				2627	req_id = ((u16)bus << 8) \| devfn;
				2628
				2629	if (dev_is_pci(dev)) {
				2630	struct pci_dev *pdev = to_pci_dev(dev);
				2631
				2632	pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
				2633
				2634	/* register PCI DMA alias device */
				2635	if (req_id != dma_alias) {
				2636	tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
				2637	dma_alias & 0xff, NULL, domain);
				2638
				2639	if (!tmp \|\| tmp != domain)
				2640	return tmp;
				2641	}
				2642	}
				2643
				2644	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
				2645	if (!tmp \|\| tmp != domain)
				2646	return tmp;
				2647
				2648	return domain;
				2649	}
				2650
				2651	static struct dmar_domain get_domain_for_dev(struct device dev, int gaw)
				2652	{
				2653	struct dmar_domain domain, tmp;
				2654
				2655	domain = find_domain(dev);
				2656	if (domain)
				2657	goto out;
				2658
				2659	domain = find_or_alloc_domain(dev, gaw);
				2660	if (!domain)
				2661	goto out;
				2662
				2663	tmp = set_domain_for_dev(dev, domain);
				2664	if (!tmp \|\| domain != tmp) {
				2665	domain_exit(domain);
				2666	domain = tmp;
				2667	}
				2668
				2669	out:
				2670
				2671	return domain;
				2672	}
				2673
				2674	static int iommu_domain_identity_map(struct dmar_domain *domain,
				2675	unsigned long long start,
				2676	unsigned long long end)
				2677	{
				2678	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
				2679	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
				2680
				2681	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
				2682	dma_to_mm_pfn(last_vpfn))) {
				2683	pr_err("Reserving iova failed\n");
				2684	return -ENOMEM;
				2685	}
				2686
				2687	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
				2688	/*
				2689	* RMRR range might have overlap with physical memory range,
				2690	* clear it first
				2691	*/
				2692	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
				2693
				2694	return __domain_mapping(domain, first_vpfn, NULL,
				2695	first_vpfn, last_vpfn - first_vpfn + 1,
				2696	DMA_PTE_READ\|DMA_PTE_WRITE);
				2697	}
				2698
				2699	static int domain_prepare_identity_map(struct device *dev,
				2700	struct dmar_domain *domain,
				2701	unsigned long long start,
				2702	unsigned long long end)
				2703	{
				2704	/* For _hardware_ passthrough, don't bother. But for software
				2705	passthrough, we do it anyway -- it may indicate a memory
				2706	range which is reserved in E820, so which didn't get set
				2707	up to start with in si_domain */
				2708	if (domain == si_domain && hw_pass_through) {
				2709	pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
				2710	dev_name(dev), start, end);
				2711	return 0;
				2712	}
				2713
				2714	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
				2715	dev_name(dev), start, end);
				2716
				2717	if (end < start) {
				2718	WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
				2719	"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
				2720	dmi_get_system_info(DMI_BIOS_VENDOR),
				2721	dmi_get_system_info(DMI_BIOS_VERSION),
				2722	dmi_get_system_info(DMI_PRODUCT_VERSION));
				2723	return -EIO;
				2724	}
				2725
				2726	if (end >> agaw_to_width(domain->agaw)) {
				2727	WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
				2728	"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
				2729	agaw_to_width(domain->agaw),
				2730	dmi_get_system_info(DMI_BIOS_VENDOR),
				2731	dmi_get_system_info(DMI_BIOS_VERSION),
				2732	dmi_get_system_info(DMI_PRODUCT_VERSION));
				2733	return -EIO;
				2734	}
				2735
				2736	return iommu_domain_identity_map(domain, start, end);
				2737	}
				2738
				2739	static int iommu_prepare_identity_map(struct device *dev,
				2740	unsigned long long start,
				2741	unsigned long long end)
				2742	{
				2743	struct dmar_domain *domain;
				2744	int ret;
				2745
				2746	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
				2747	if (!domain)
				2748	return -ENOMEM;
				2749
				2750	ret = domain_prepare_identity_map(dev, domain, start, end);
				2751	if (ret)
				2752	domain_exit(domain);
				2753
				2754	return ret;
				2755	}
				2756
				2757	static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
				2758	struct device *dev)
				2759	{
				2760	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
				2761	return 0;
				2762	return iommu_prepare_identity_map(dev, rmrr->base_address,
				2763	rmrr->end_address);
				2764	}
				2765
				2766	#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
				2767	static inline void iommu_prepare_isa(void)
				2768	{
				2769	struct pci_dev *pdev;
				2770	int ret;
				2771
				2772	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
				2773	if (!pdev)
				2774	return;
				2775
				2776	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
				2777	ret = iommu_prepare_identity_map(&pdev->dev, 0, 1610241024 - 1);
				2778
				2779	if (ret)
				2780	pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
				2781
				2782	pci_dev_put(pdev);
				2783	}
				2784	#else
				2785	static inline void iommu_prepare_isa(void)
				2786	{
				2787	return;
				2788	}
				2789	#endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
				2790
				2791	static int md_domain_init(struct dmar_domain *domain, int guest_width);
				2792
				2793	static int __init si_domain_init(int hw)
				2794	{
				2795	int nid, ret = 0;
				2796
				2797	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
				2798	if (!si_domain)
				2799	return -EFAULT;
				2800
				2801	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
				2802	domain_exit(si_domain);
				2803	return -EFAULT;
				2804	}
				2805
				2806	pr_debug("Identity mapping domain allocated\n");
				2807
				2808	if (hw)
				2809	return 0;
				2810
				2811	for_each_online_node(nid) {
				2812	unsigned long start_pfn, end_pfn;
				2813	int i;
				2814
				2815	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
				2816	ret = iommu_domain_identity_map(si_domain,
				2817	PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
				2818	if (ret)
				2819	return ret;
				2820	}
				2821	}
				2822
				2823	return 0;
				2824	}
				2825
				2826	static int identity_mapping(struct device *dev)
				2827	{
				2828	struct device_domain_info *info;
				2829
				2830	if (likely(!iommu_identity_mapping))
				2831	return 0;
				2832
				2833	info = dev->archdata.iommu;
				2834	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
				2835	return (info->domain == si_domain);
				2836
				2837	return 0;
				2838	}
				2839
				2840	static int domain_add_dev_info(struct dmar_domain domain, struct device dev)
				2841	{
				2842	struct dmar_domain *ndomain;
				2843	struct intel_iommu *iommu;
				2844	u8 bus, devfn;
				2845
				2846	iommu = device_to_iommu(dev, &bus, &devfn);
				2847	if (!iommu)
				2848	return -ENODEV;
				2849
				2850	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
				2851	if (ndomain != domain)
				2852	return -EBUSY;
				2853
				2854	return 0;
				2855	}
				2856
				2857	static bool device_has_rmrr(struct device *dev)
				2858	{
				2859	struct dmar_rmrr_unit *rmrr;
				2860	struct device *tmp;
				2861	int i;
				2862
				2863	rcu_read_lock();
				2864	for_each_rmrr_units(rmrr) {
				2865	/*
				2866	* Return TRUE if this RMRR contains the device that
				2867	* is passed in.
				2868	*/
				2869	for_each_active_dev_scope(rmrr->devices,
				2870	rmrr->devices_cnt, i, tmp)
				2871	if (tmp == dev) {
				2872	rcu_read_unlock();
				2873	return true;
				2874	}
				2875	}
				2876	rcu_read_unlock();
				2877	return false;
				2878	}
				2879
				2880	/*
				2881	* There are a couple cases where we need to restrict the functionality of
				2882	* devices associated with RMRRs. The first is when evaluating a device for
				2883	* identity mapping because problems exist when devices are moved in and out
				2884	* of domains and their respective RMRR information is lost. This means that
				2885	* a device with associated RMRRs will never be in a "passthrough" domain.
				2886	* The second is use of the device through the IOMMU API. This interface
				2887	* expects to have full control of the IOVA space for the device. We cannot
				2888	* satisfy both the requirement that RMRR access is maintained and have an
				2889	* unencumbered IOVA space. We also have no ability to quiesce the device's
				2890	* use of the RMRR space or even inform the IOMMU API user of the restriction.
				2891	* We therefore prevent devices associated with an RMRR from participating in
				2892	* the IOMMU API, which eliminates them from device assignment.
				2893	*
				2894	* In both cases we assume that PCI USB devices with RMRRs have them largely
				2895	* for historical reasons and that the RMRR space is not actively used post
				2896	* boot. This exclusion may change if vendors begin to abuse it.
				2897	*
				2898	* The same exception is made for graphics devices, with the requirement that
				2899	* any use of the RMRR regions will be torn down before assigning the device
				2900	* to a guest.
				2901	*/
				2902	static bool device_is_rmrr_locked(struct device *dev)
				2903	{
				2904	if (!device_has_rmrr(dev))
				2905	return false;
				2906
				2907	if (dev_is_pci(dev)) {
				2908	struct pci_dev *pdev = to_pci_dev(dev);
				2909
				2910	if (IS_USB_DEVICE(pdev) \|\| IS_GFX_DEVICE(pdev))
				2911	return false;
				2912	}
				2913
				2914	return true;
				2915	}
				2916
				2917	static int iommu_should_identity_map(struct device *dev, int startup)
				2918	{
				2919
				2920	if (dev_is_pci(dev)) {
				2921	struct pci_dev *pdev = to_pci_dev(dev);
				2922
				2923	if (device_is_rmrr_locked(dev))
				2924	return 0;
				2925
				2926	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
				2927	return 1;
				2928
				2929	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
				2930	return 1;
				2931
				2932	if (!(iommu_identity_mapping & IDENTMAP_ALL))
				2933	return 0;
				2934
				2935	/*
				2936	* We want to start off with all devices in the 1:1 domain, and
				2937	* take them out later if we find they can't access all of memory.
				2938	*
				2939	* However, we can't do this for PCI devices behind bridges,
				2940	* because all PCI devices behind the same bridge will end up
				2941	* with the same source-id on their transactions.
				2942	*
				2943	* Practically speaking, we can't change things around for these
				2944	* devices at run-time, because we can't be sure there'll be no
				2945	* DMA transactions in flight for any of their siblings.
				2946	*
				2947	* So PCI devices (unless they're on the root bus) as well as
				2948	* their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
				2949	* the 1:1 domain, just in _case_ one of their siblings turns out
				2950	* not to be able to map all of memory.
				2951	*/
				2952	if (!pci_is_pcie(pdev)) {
				2953	if (!pci_is_root_bus(pdev->bus))
				2954	return 0;
				2955	if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				2956	return 0;
				2957	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
				2958	return 0;
				2959	} else {
				2960	if (device_has_rmrr(dev))
				2961	return 0;
				2962	}
				2963
				2964	/*
				2965	* At boot time, we don't yet know if devices will be 64-bit capable.
				2966	* Assume that they will — if they turn out not to be, then we can
				2967	* take them out of the 1:1 domain later.
				2968	*/
				2969	if (!startup) {
				2970	/*
				2971	* If the device's dma_mask is less than the system's memory
				2972	* size then this is not a candidate for identity mapping.
				2973	*/
				2974	u64 dma_mask = *dev->dma_mask;
				2975
				2976	if (dev->coherent_dma_mask &&
				2977	dev->coherent_dma_mask < dma_mask)
				2978	dma_mask = dev->coherent_dma_mask;
				2979
				2980	return dma_mask >= dma_get_required_mask(dev);
				2981	}
				2982
				2983	return 1;
				2984	}
				2985
				2986	static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
				2987	{
				2988	int ret;
				2989
				2990	if (!iommu_should_identity_map(dev, 1))
				2991	return 0;
				2992
				2993	ret = domain_add_dev_info(si_domain, dev);
				2994	if (!ret)
				2995	pr_info("%s identity mapping for device %s\n",
				2996	hw ? "Hardware" : "Software", dev_name(dev));
				2997	else if (ret == -ENODEV)
				2998	/* device not associated with an iommu */
				2999	ret = 0;
				3000
				3001	return ret;
				3002	}
				3003
				3004
				3005	static int __init iommu_prepare_static_identity_mapping(int hw)
				3006	{
				3007	struct pci_dev *pdev = NULL;
				3008	struct dmar_drhd_unit *drhd;
				3009	struct intel_iommu *iommu;
				3010	struct device *dev;
				3011	int i;
				3012	int ret = 0;
				3013
				3014	for_each_pci_dev(pdev) {
				3015	ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
				3016	if (ret)
				3017	return ret;
				3018	}
				3019
				3020	for_each_active_iommu(iommu, drhd)
				3021	for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
				3022	struct acpi_device_physical_node *pn;
				3023	struct acpi_device *adev;
				3024
				3025	if (dev->bus != &acpi_bus_type)
				3026	continue;
				3027
				3028	adev= to_acpi_device(dev);
				3029	mutex_lock(&adev->physical_node_lock);
				3030	list_for_each_entry(pn, &adev->physical_node_list, node) {
				3031	ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				3032	if (ret)
				3033	break;
				3034	}
				3035	mutex_unlock(&adev->physical_node_lock);
				3036	if (ret)
				3037	return ret;
				3038	}
				3039
				3040	return 0;
				3041	}
				3042
				3043	static void intel_iommu_init_qi(struct intel_iommu *iommu)
				3044	{
				3045	/*
				3046	* Start from the sane iommu hardware state.
				3047	* If the queued invalidation is already initialized by us
				3048	* (for example, while enabling interrupt-remapping) then
				3049	* we got the things already rolling from a sane state.
				3050	*/
				3051	if (!iommu->qi) {
				3052	/*
				3053	* Clear any previous faults.
				3054	*/
				3055	dmar_fault(-1, iommu);
				3056	/*
				3057	* Disable queued invalidation if supported and already enabled
				3058	* before OS handover.
				3059	*/
				3060	dmar_disable_qi(iommu);
				3061	}
				3062
				3063	if (dmar_enable_qi(iommu)) {
				3064	/*
				3065	* Queued Invalidate not enabled, use Register Based Invalidate
				3066	*/
				3067	iommu->flush.flush_context = __iommu_flush_context;
				3068	iommu->flush.flush_iotlb = __iommu_flush_iotlb;
				3069	pr_info("%s: Using Register based invalidation\n",
				3070	iommu->name);
				3071	} else {
				3072	iommu->flush.flush_context = qi_flush_context;
				3073	iommu->flush.flush_iotlb = qi_flush_iotlb;
				3074	pr_info("%s: Using Queued invalidation\n", iommu->name);
				3075	}
				3076	}
				3077
				3078	static int copy_context_table(struct intel_iommu *iommu,
				3079	struct root_entry *old_re,
				3080	struct context_entry **tbl,
				3081	int bus, bool ext)
				3082	{
				3083	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
				3084	struct context_entry *new_ce = NULL, ce;
				3085	struct context_entry *old_ce = NULL;
				3086	struct root_entry re;
				3087	phys_addr_t old_ce_phys;
				3088
				3089	tbl_idx = ext ? bus * 2 : bus;
				3090	memcpy(&re, old_re, sizeof(re));
				3091
				3092	for (devfn = 0; devfn < 256; devfn++) {
				3093	/* First calculate the correct index */
				3094	idx = (ext ? devfn * 2 : devfn) % 256;
				3095
				3096	if (idx == 0) {
				3097	/* First save what we may have and clean up */
				3098	if (new_ce) {
				3099	tbl[tbl_idx] = new_ce;
				3100	__iommu_flush_cache(iommu, new_ce,
				3101	VTD_PAGE_SIZE);
				3102	pos = 1;
				3103	}
				3104
				3105	if (old_ce)
				3106	memunmap(old_ce);
				3107
				3108	ret = 0;
				3109	if (devfn < 0x80)
				3110	old_ce_phys = root_entry_lctp(&re);
				3111	else
				3112	old_ce_phys = root_entry_uctp(&re);
				3113
				3114	if (!old_ce_phys) {
				3115	if (ext && devfn == 0) {
				3116	/* No LCTP, try UCTP */
				3117	devfn = 0x7f;
				3118	continue;
				3119	} else {
				3120	goto out;
				3121	}
				3122	}
				3123
				3124	ret = -ENOMEM;
				3125	old_ce = memremap(old_ce_phys, PAGE_SIZE,
				3126	MEMREMAP_WB);
				3127	if (!old_ce)
				3128	goto out;
				3129
				3130	new_ce = alloc_pgtable_page(iommu->node);
				3131	if (!new_ce)
				3132	goto out_unmap;
				3133
				3134	ret = 0;
				3135	}
				3136
				3137	/* Now copy the context entry */
				3138	memcpy(&ce, old_ce + idx, sizeof(ce));
				3139
				3140	if (!__context_present(&ce))
				3141	continue;
				3142
				3143	did = context_domain_id(&ce);
				3144	if (did >= 0 && did < cap_ndoms(iommu->cap))
				3145	set_bit(did, iommu->domain_ids);
				3146
				3147	/*
				3148	* We need a marker for copied context entries. This
				3149	* marker needs to work for the old format as well as
				3150	* for extended context entries.
				3151	*
				3152	* Bit 67 of the context entry is used. In the old
				3153	* format this bit is available to software, in the
				3154	* extended format it is the PGE bit, but PGE is ignored
				3155	* by HW if PASIDs are disabled (and thus still
				3156	* available).
				3157	*
				3158	* So disable PASIDs first and then mark the entry
				3159	* copied. This means that we don't copy PASID
				3160	* translations from the old kernel, but this is fine as
				3161	* faults there are not fatal.
				3162	*/
				3163	context_clear_pasid_enable(&ce);
				3164	context_set_copied(&ce);
				3165
				3166	new_ce[idx] = ce;
				3167	}
				3168
				3169	tbl[tbl_idx + pos] = new_ce;
				3170
				3171	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
				3172
				3173	out_unmap:
				3174	memunmap(old_ce);
				3175
				3176	out:
				3177	return ret;
				3178	}
				3179
				3180	static int copy_translation_tables(struct intel_iommu *iommu)
				3181	{
				3182	struct context_entry **ctxt_tbls;
				3183	struct root_entry *old_rt;
				3184	phys_addr_t old_rt_phys;
				3185	int ctxt_table_entries;
				3186	unsigned long flags;
				3187	u64 rtaddr_reg;
				3188	int bus, ret;
				3189	bool new_ext, ext;
				3190
				3191	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
				3192	ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
				3193	new_ext = !!ecap_ecs(iommu->ecap);
				3194
				3195	/*
				3196	* The RTT bit can only be changed when translation is disabled,
				3197	* but disabling translation means to open a window for data
				3198	* corruption. So bail out and don't copy anything if we would
				3199	* have to change the bit.
				3200	*/
				3201	if (new_ext != ext)
				3202	return -EINVAL;
				3203
				3204	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
				3205	if (!old_rt_phys)
				3206	return -EINVAL;
				3207
				3208	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
				3209	if (!old_rt)
				3210	return -ENOMEM;
				3211
				3212	/* This is too big for the stack - allocate it from slab */
				3213	ctxt_table_entries = ext ? 512 : 256;
				3214	ret = -ENOMEM;
				3215	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
				3216	if (!ctxt_tbls)
				3217	goto out_unmap;
				3218
				3219	for (bus = 0; bus < 256; bus++) {
				3220	ret = copy_context_table(iommu, &old_rt[bus],
				3221	ctxt_tbls, bus, ext);
				3222	if (ret) {
				3223	pr_err("%s: Failed to copy context table for bus %d\n",
				3224	iommu->name, bus);
				3225	continue;
				3226	}
				3227	}
				3228
				3229	spin_lock_irqsave(&iommu->lock, flags);
				3230
				3231	/* Context tables are copied, now write them to the root_entry table */
				3232	for (bus = 0; bus < 256; bus++) {
				3233	int idx = ext ? bus * 2 : bus;
				3234	u64 val;
				3235
				3236	if (ctxt_tbls[idx]) {
				3237	val = virt_to_phys(ctxt_tbls[idx]) \| 1;
				3238	iommu->root_entry[bus].lo = val;
				3239	}
				3240
				3241	if (!ext \|\| !ctxt_tbls[idx + 1])
				3242	continue;
				3243
				3244	val = virt_to_phys(ctxt_tbls[idx + 1]) \| 1;
				3245	iommu->root_entry[bus].hi = val;
				3246	}
				3247
				3248	spin_unlock_irqrestore(&iommu->lock, flags);
				3249
				3250	kfree(ctxt_tbls);
				3251
				3252	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
				3253
				3254	ret = 0;
				3255
				3256	out_unmap:
				3257	memunmap(old_rt);
				3258
				3259	return ret;
				3260	}
				3261
				3262	static int __init init_dmars(void)
				3263	{
				3264	struct dmar_drhd_unit *drhd;
				3265	struct dmar_rmrr_unit *rmrr;
				3266	bool copied_tables = false;
				3267	struct device *dev;
				3268	struct intel_iommu *iommu;
				3269	int i, ret;
				3270
				3271	/*
				3272	* for each drhd
				3273	* allocate root
				3274	* initialize and program root entry to not present
				3275	* endfor
				3276	*/
				3277	for_each_drhd_unit(drhd) {
				3278	/*
				3279	* lock not needed as this is only incremented in the single
				3280	* threaded kernel __init code path all other access are read
				3281	* only
				3282	*/
				3283	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
				3284	g_num_of_iommus++;
				3285	continue;
				3286	}
				3287	pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
				3288	}
				3289
				3290	/* Preallocate enough resources for IOMMU hot-addition */
				3291	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
				3292	g_num_of_iommus = DMAR_UNITS_SUPPORTED;
				3293
				3294	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
				3295	GFP_KERNEL);
				3296	if (!g_iommus) {
				3297	pr_err("Allocating global iommu array failed\n");
				3298	ret = -ENOMEM;
				3299	goto error;
				3300	}
				3301
				3302	for_each_active_iommu(iommu, drhd) {
				3303	/*
				3304	* Find the max pasid size of all IOMMU's in the system.
				3305	* We need to ensure the system pasid table is no bigger
				3306	* than the smallest supported.
				3307	*/
				3308	if (pasid_enabled(iommu)) {
				3309	u32 temp = 2 << ecap_pss(iommu->ecap);
				3310
				3311	intel_pasid_max_id = min_t(u32, temp,
				3312	intel_pasid_max_id);
				3313	}
				3314
				3315	g_iommus[iommu->seq_id] = iommu;
				3316
				3317	intel_iommu_init_qi(iommu);
				3318
				3319	ret = iommu_init_domains(iommu);
				3320	if (ret)
				3321	goto free_iommu;
				3322
				3323	init_translation_status(iommu);
				3324
				3325	if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
				3326	iommu_disable_translation(iommu);
				3327	clear_translation_pre_enabled(iommu);
				3328	pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				3329	iommu->name);
				3330	}
				3331
				3332	/*
				3333	* TBD:
				3334	* we could share the same root & context tables
				3335	* among all IOMMU's. Need to Split it later.
				3336	*/
				3337	ret = iommu_alloc_root_entry(iommu);
				3338	if (ret)
				3339	goto free_iommu;
				3340
				3341	if (translation_pre_enabled(iommu)) {
				3342	pr_info("Translation already enabled - trying to copy translation structures\n");
				3343
				3344	ret = copy_translation_tables(iommu);
				3345	if (ret) {
				3346	/*
				3347	* We found the IOMMU with translation
				3348	* enabled - but failed to copy over the
				3349	* old root-entry table. Try to proceed
				3350	* by disabling translation now and
				3351	* allocating a clean root-entry table.
				3352	* This might cause DMAR faults, but
				3353	* probably the dump will still succeed.
				3354	*/
				3355	pr_err("Failed to copy translation tables from previous kernel for %s\n",
				3356	iommu->name);
				3357	iommu_disable_translation(iommu);
				3358	clear_translation_pre_enabled(iommu);
				3359	} else {
				3360	pr_info("Copied translation tables from previous kernel for %s\n",
				3361	iommu->name);
				3362	copied_tables = true;
				3363	}
				3364	}
				3365
				3366	if (!ecap_pass_through(iommu->ecap))
				3367	hw_pass_through = 0;
				3368	#ifdef CONFIG_INTEL_IOMMU_SVM
				3369	if (pasid_enabled(iommu))
				3370	intel_svm_init(iommu);
				3371	#endif
				3372	}
				3373
				3374	/*
				3375	* Now that qi is enabled on all iommus, set the root entry and flush
				3376	* caches. This is required on some Intel X58 chipsets, otherwise the
				3377	* flush_context function will loop forever and the boot hangs.
				3378	*/
				3379	for_each_active_iommu(iommu, drhd) {
				3380	iommu_flush_write_buffer(iommu);
				3381	iommu_set_root_entry(iommu);
				3382	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
				3383	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
				3384	}
				3385
				3386	if (iommu_pass_through)
				3387	iommu_identity_mapping \|= IDENTMAP_ALL;
				3388
				3389	#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
				3390	iommu_identity_mapping \|= IDENTMAP_GFX;
				3391	#endif
				3392
				3393	check_tylersburg_isoch();
				3394
				3395	if (iommu_identity_mapping) {
				3396	ret = si_domain_init(hw_pass_through);
				3397	if (ret)
				3398	goto free_iommu;
				3399	}
				3400
				3401
				3402	/*
				3403	* If we copied translations from a previous kernel in the kdump
				3404	* case, we can not assign the devices to domains now, as that
				3405	* would eliminate the old mappings. So skip this part and defer
				3406	* the assignment to device driver initialization time.
				3407	*/
				3408	if (copied_tables)
				3409	goto domains_done;
				3410
				3411	/*
				3412	* If pass through is not set or not enabled, setup context entries for
				3413	* identity mappings for rmrr, gfx, and isa and may fall back to static
				3414	* identity mapping if iommu_identity_mapping is set.
				3415	*/
				3416	if (iommu_identity_mapping) {
				3417	ret = iommu_prepare_static_identity_mapping(hw_pass_through);
				3418	if (ret) {
				3419	pr_crit("Failed to setup IOMMU pass-through\n");
				3420	goto free_iommu;
				3421	}
				3422	}
				3423	/*
				3424	* For each rmrr
				3425	* for each dev attached to rmrr
				3426	* do
				3427	* locate drhd for dev, alloc domain for dev
				3428	* allocate free domain
				3429	* allocate page table entries for rmrr
				3430	* if context not allocated for bus
				3431	* allocate and init context
				3432	* set present in root table for this bus
				3433	* init context with domain, translation etc
				3434	* endfor
				3435	* endfor
				3436	*/
				3437	pr_info("Setting RMRR:\n");
				3438	for_each_rmrr_units(rmrr) {
				3439	/* some BIOS lists non-exist devices in DMAR table. */
				3440	for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
				3441	i, dev) {
				3442	ret = iommu_prepare_rmrr_dev(rmrr, dev);
				3443	if (ret)
				3444	pr_err("Mapping reserved region failed\n");
				3445	}
				3446	}
				3447
				3448	iommu_prepare_isa();
				3449
				3450	domains_done:
				3451
				3452	/*
				3453	* for each drhd
				3454	* enable fault log
				3455	* global invalidate context cache
				3456	* global invalidate iotlb
				3457	* enable translation
				3458	*/
				3459	for_each_iommu(iommu, drhd) {
				3460	if (drhd->ignored) {
				3461	/*
				3462	* we always have to disable PMRs or DMA may fail on
				3463	* this device
				3464	*/
				3465	if (force_on)
				3466	iommu_disable_protect_mem_regions(iommu);
				3467	continue;
				3468	}
				3469
				3470	iommu_flush_write_buffer(iommu);
				3471
				3472	#ifdef CONFIG_INTEL_IOMMU_SVM
				3473	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
				3474	ret = intel_svm_enable_prq(iommu);
				3475	if (ret)
				3476	goto free_iommu;
				3477	}
				3478	#endif
				3479	ret = dmar_set_interrupt(iommu);
				3480	if (ret)
				3481	goto free_iommu;
				3482
				3483	if (!translation_pre_enabled(iommu))
				3484	iommu_enable_translation(iommu);
				3485
				3486	iommu_disable_protect_mem_regions(iommu);
				3487	}
				3488
				3489	return 0;
				3490
				3491	free_iommu:
				3492	for_each_active_iommu(iommu, drhd) {
				3493	disable_dmar_iommu(iommu);
				3494	free_dmar_iommu(iommu);
				3495	}
				3496
				3497	kfree(g_iommus);
				3498
				3499	error:
				3500	return ret;
				3501	}
				3502
				3503	/* This takes a number of _MM_ pages, not VTD pages */
				3504	static unsigned long intel_alloc_iova(struct device *dev,
				3505	struct dmar_domain *domain,
				3506	unsigned long nrpages, uint64_t dma_mask)
				3507	{
				3508	unsigned long iova_pfn = 0;
				3509
				3510	/* Restrict dma_mask to the width that the iommu can handle */
				3511	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
				3512	/* Ensure we reserve the whole size-aligned region */
				3513	nrpages = __roundup_pow_of_two(nrpages);
				3514
				3515	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
				3516	/*
				3517	* First try to allocate an io virtual address in
				3518	* DMA_BIT_MASK(32) and if that fails then try allocating
				3519	* from higher range
				3520	*/
				3521	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
				3522	IOVA_PFN(DMA_BIT_MASK(32)), false);
				3523	if (iova_pfn)
				3524	return iova_pfn;
				3525	}
				3526	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
				3527	IOVA_PFN(dma_mask), true);
				3528	if (unlikely(!iova_pfn)) {
				3529	pr_err("Allocating %ld-page iova for %s failed",
				3530	nrpages, dev_name(dev));
				3531	return 0;
				3532	}
				3533
				3534	return iova_pfn;
				3535	}
				3536
				3537	struct dmar_domain get_valid_domain_for_dev(struct device dev)
				3538	{
				3539	struct dmar_domain domain, tmp;
				3540	struct dmar_rmrr_unit *rmrr;
				3541	struct device *i_dev;
				3542	int i, ret;
				3543
				3544	domain = find_domain(dev);
				3545	if (domain)
				3546	goto out;
				3547
				3548	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
				3549	if (!domain)
				3550	goto out;
				3551
				3552	/* We have a new domain - setup possible RMRRs for the device */
				3553	rcu_read_lock();
				3554	for_each_rmrr_units(rmrr) {
				3555	for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
				3556	i, i_dev) {
				3557	if (i_dev != dev)
				3558	continue;
				3559
				3560	ret = domain_prepare_identity_map(dev, domain,
				3561	rmrr->base_address,
				3562	rmrr->end_address);
				3563	if (ret)
				3564	dev_err(dev, "Mapping reserved region failed\n");
				3565	}
				3566	}
				3567	rcu_read_unlock();
				3568
				3569	tmp = set_domain_for_dev(dev, domain);
				3570	if (!tmp \|\| domain != tmp) {
				3571	domain_exit(domain);
				3572	domain = tmp;
				3573	}
				3574
				3575	out:
				3576
				3577	if (!domain)
				3578	pr_err("Allocating domain for %s failed\n", dev_name(dev));
				3579
				3580
				3581	return domain;
				3582	}
				3583
				3584	/* Check if the dev needs to go through non-identity map and unmap process.*/
				3585	static int iommu_no_mapping(struct device *dev)
				3586	{
				3587	int found;
				3588
				3589	if (iommu_dummy(dev))
				3590	return 1;
				3591
				3592	if (!iommu_identity_mapping)
				3593	return 0;
				3594
				3595	found = identity_mapping(dev);
				3596	if (found) {
				3597	if (iommu_should_identity_map(dev, 0))
				3598	return 1;
				3599	else {
				3600	/*
				3601	* 32 bit DMA is removed from si_domain and fall back
				3602	* to non-identity mapping.
				3603	*/
				3604	dmar_remove_one_dev_info(si_domain, dev);
				3605	pr_info("32bit %s uses non-identity mapping\n",
				3606	dev_name(dev));
				3607	return 0;
				3608	}
				3609	} else {
				3610	/*
				3611	* In case of a detached 64 bit DMA device from vm, the device
				3612	* is put into si_domain for identity mapping.
				3613	*/
				3614	if (iommu_should_identity_map(dev, 0)) {
				3615	int ret;
				3616	ret = domain_add_dev_info(si_domain, dev);
				3617	if (!ret) {
				3618	pr_info("64bit %s uses identity mapping\n",
				3619	dev_name(dev));
				3620	return 1;
				3621	}
				3622	}
				3623	}
				3624
				3625	return 0;
				3626	}
				3627
				3628	static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				3629	size_t size, int dir, u64 dma_mask)
				3630	{
				3631	struct dmar_domain *domain;
				3632	phys_addr_t start_paddr;
				3633	unsigned long iova_pfn;
				3634	int prot = 0;
				3635	int ret;
				3636	struct intel_iommu *iommu;
				3637	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
				3638
				3639	BUG_ON(dir == DMA_NONE);
				3640
				3641	if (iommu_no_mapping(dev))
				3642	return paddr;
				3643
				3644	domain = get_valid_domain_for_dev(dev);
				3645	if (!domain)
				3646	return 0;
				3647
				3648	iommu = domain_get_iommu(domain);
				3649	size = aligned_nrpages(paddr, size);
				3650
				3651	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
				3652	if (!iova_pfn)
				3653	goto error;
				3654
				3655	/*
				3656	* Check if DMAR supports zero-length reads on write only
				3657	* mappings..
				3658	*/
				3659	if (dir == DMA_TO_DEVICE \|\| dir == DMA_BIDIRECTIONAL \|\| \
				3660	!cap_zlr(iommu->cap))
				3661	prot \|= DMA_PTE_READ;
				3662	if (dir == DMA_FROM_DEVICE \|\| dir == DMA_BIDIRECTIONAL)
				3663	prot \|= DMA_PTE_WRITE;
				3664	/*
				3665	* paddr - (paddr + size) might be partial page, we should map the whole
				3666	* page. Note: if two part of one page are separately mapped, we
				3667	* might have two guest_addr mapping to the same host paddr, but this
				3668	* is not a big problem
				3669	*/
				3670	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				3671	mm_to_dma_pfn(paddr_pfn), size, prot);
				3672	if (ret)
				3673	goto error;
				3674
				3675	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
				3676	start_paddr += paddr & ~PAGE_MASK;
				3677	return start_paddr;
				3678
				3679	error:
				3680	if (iova_pfn)
				3681	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
				3682	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
				3683	dev_name(dev), size, (unsigned long long)paddr, dir);
				3684	return 0;
				3685	}
				3686
				3687	static dma_addr_t intel_map_page(struct device dev, struct page page,
				3688	unsigned long offset, size_t size,
				3689	enum dma_data_direction dir,
				3690	unsigned long attrs)
				3691	{
				3692	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				3693	dir, *dev->dma_mask);
				3694	}
				3695
				3696	static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
				3697	{
				3698	struct dmar_domain *domain;
				3699	unsigned long start_pfn, last_pfn;
				3700	unsigned long nrpages;
				3701	unsigned long iova_pfn;
				3702	struct intel_iommu *iommu;
				3703	struct page *freelist;
				3704
				3705	if (iommu_no_mapping(dev))
				3706	return;
				3707
				3708	domain = find_domain(dev);
				3709	BUG_ON(!domain);
				3710
				3711	iommu = domain_get_iommu(domain);
				3712
				3713	iova_pfn = IOVA_PFN(dev_addr);
				3714
				3715	nrpages = aligned_nrpages(dev_addr, size);
				3716	start_pfn = mm_to_dma_pfn(iova_pfn);
				3717	last_pfn = start_pfn + nrpages - 1;
				3718
				3719	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
				3720	dev_name(dev), start_pfn, last_pfn);
				3721
				3722	freelist = domain_unmap(domain, start_pfn, last_pfn);
				3723
				3724	if (intel_iommu_strict \|\| !has_iova_flush_queue(&domain->iovad)) {
				3725	iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				3726	nrpages, !freelist, 0);
				3727	/* free iova */
				3728	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
				3729	dma_free_pagelist(freelist);
				3730	} else {
				3731	queue_iova(&domain->iovad, iova_pfn, nrpages,
				3732	(unsigned long)freelist);
				3733	/*
				3734	* queue up the release of the unmap to save the 1/6th of the
				3735	* cpu used up by the iotlb flush operation...
				3736	*/
				3737	}
				3738	}
				3739
				3740	static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
				3741	size_t size, enum dma_data_direction dir,
				3742	unsigned long attrs)
				3743	{
				3744	intel_unmap(dev, dev_addr, size);
				3745	}
				3746
				3747	static void intel_alloc_coherent(struct device dev, size_t size,
				3748	dma_addr_t *dma_handle, gfp_t flags,
				3749	unsigned long attrs)
				3750	{
				3751	struct page *page = NULL;
				3752	int order;
				3753
				3754	size = PAGE_ALIGN(size);
				3755	order = get_order(size);
				3756
				3757	if (!iommu_no_mapping(dev))
				3758	flags &= ~(GFP_DMA \| GFP_DMA32);
				3759	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
				3760	if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
				3761	flags \|= GFP_DMA;
				3762	else
				3763	flags \|= GFP_DMA32;
				3764	}
				3765
				3766	if (gfpflags_allow_blocking(flags)) {
				3767	unsigned int count = size >> PAGE_SHIFT;
				3768
				3769	page = dma_alloc_from_contiguous(dev, count, order,
				3770	flags & __GFP_NOWARN);
				3771	if (page && iommu_no_mapping(dev) &&
				3772	page_to_phys(page) + size > dev->coherent_dma_mask) {
				3773	dma_release_from_contiguous(dev, page, count);
				3774	page = NULL;
				3775	}
				3776	}
				3777
				3778	if (!page)
				3779	page = alloc_pages(flags, order);
				3780	if (!page)
				3781	return NULL;
				3782	memset(page_address(page), 0, size);
				3783
				3784	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
				3785	DMA_BIDIRECTIONAL,
				3786	dev->coherent_dma_mask);
				3787	if (*dma_handle)
				3788	return page_address(page);
				3789	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
				3790	__free_pages(page, order);
				3791
				3792	return NULL;
				3793	}
				3794
				3795	static void intel_free_coherent(struct device dev, size_t size, void vaddr,
				3796	dma_addr_t dma_handle, unsigned long attrs)
				3797	{
				3798	int order;
				3799	struct page *page = virt_to_page(vaddr);
				3800
				3801	size = PAGE_ALIGN(size);
				3802	order = get_order(size);
				3803
				3804	intel_unmap(dev, dma_handle, size);
				3805	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
				3806	__free_pages(page, order);
				3807	}
				3808
				3809	static void intel_unmap_sg(struct device dev, struct scatterlist sglist,
				3810	int nelems, enum dma_data_direction dir,
				3811	unsigned long attrs)
				3812	{
				3813	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
				3814	unsigned long nrpages = 0;
				3815	struct scatterlist *sg;
				3816	int i;
				3817
				3818	for_each_sg(sglist, sg, nelems, i) {
				3819	nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
				3820	}
				3821
				3822	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
				3823	}
				3824
				3825	static int intel_nontranslate_map_sg(struct device *hddev,
				3826	struct scatterlist *sglist, int nelems, int dir)
				3827	{
				3828	int i;
				3829	struct scatterlist *sg;
				3830
				3831	for_each_sg(sglist, sg, nelems, i) {
				3832	BUG_ON(!sg_page(sg));
				3833	sg->dma_address = sg_phys(sg);
				3834	sg->dma_length = sg->length;
				3835	}
				3836	return nelems;
				3837	}
				3838
				3839	static int intel_map_sg(struct device dev, struct scatterlist sglist, int nelems,
				3840	enum dma_data_direction dir, unsigned long attrs)
				3841	{
				3842	int i;
				3843	struct dmar_domain *domain;
				3844	size_t size = 0;
				3845	int prot = 0;
				3846	unsigned long iova_pfn;
				3847	int ret;
				3848	struct scatterlist *sg;
				3849	unsigned long start_vpfn;
				3850	struct intel_iommu *iommu;
				3851
				3852	BUG_ON(dir == DMA_NONE);
				3853	if (iommu_no_mapping(dev))
				3854	return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
				3855
				3856	domain = get_valid_domain_for_dev(dev);
				3857	if (!domain)
				3858	return 0;
				3859
				3860	iommu = domain_get_iommu(domain);
				3861
				3862	for_each_sg(sglist, sg, nelems, i)
				3863	size += aligned_nrpages(sg->offset, sg->length);
				3864
				3865	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				3866	*dev->dma_mask);
				3867	if (!iova_pfn) {
				3868	sglist->dma_length = 0;
				3869	return 0;
				3870	}
				3871
				3872	/*
				3873	* Check if DMAR supports zero-length reads on write only
				3874	* mappings..
				3875	*/
				3876	if (dir == DMA_TO_DEVICE \|\| dir == DMA_BIDIRECTIONAL \|\| \
				3877	!cap_zlr(iommu->cap))
				3878	prot \|= DMA_PTE_READ;
				3879	if (dir == DMA_FROM_DEVICE \|\| dir == DMA_BIDIRECTIONAL)
				3880	prot \|= DMA_PTE_WRITE;
				3881
				3882	start_vpfn = mm_to_dma_pfn(iova_pfn);
				3883
				3884	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
				3885	if (unlikely(ret)) {
				3886	dma_pte_free_pagetable(domain, start_vpfn,
				3887	start_vpfn + size - 1,
				3888	agaw_to_level(domain->agaw) + 1);
				3889	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
				3890	return 0;
				3891	}
				3892
				3893	return nelems;
				3894	}
				3895
				3896	static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
				3897	{
				3898	return !dma_addr;
				3899	}
				3900
				3901	const struct dma_map_ops intel_dma_ops = {
				3902	.alloc = intel_alloc_coherent,
				3903	.free = intel_free_coherent,
				3904	.map_sg = intel_map_sg,
				3905	.unmap_sg = intel_unmap_sg,
				3906	.map_page = intel_map_page,
				3907	.unmap_page = intel_unmap_page,
				3908	.mapping_error = intel_mapping_error,
				3909	#ifdef CONFIG_X86
				3910	.dma_supported = dma_direct_supported,
				3911	#endif
				3912	};
				3913
				3914	static inline int iommu_domain_cache_init(void)
				3915	{
				3916	int ret = 0;
				3917
				3918	iommu_domain_cache = kmem_cache_create("iommu_domain",
				3919	sizeof(struct dmar_domain),
				3920	0,
				3921	SLAB_HWCACHE_ALIGN,
				3922
				3923	NULL);
				3924	if (!iommu_domain_cache) {
				3925	pr_err("Couldn't create iommu_domain cache\n");
				3926	ret = -ENOMEM;
				3927	}
				3928
				3929	return ret;
				3930	}
				3931
				3932	static inline int iommu_devinfo_cache_init(void)
				3933	{
				3934	int ret = 0;
				3935
				3936	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
				3937	sizeof(struct device_domain_info),
				3938	0,
				3939	SLAB_HWCACHE_ALIGN,
				3940	NULL);
				3941	if (!iommu_devinfo_cache) {
				3942	pr_err("Couldn't create devinfo cache\n");
				3943	ret = -ENOMEM;
				3944	}
				3945
				3946	return ret;
				3947	}
				3948
				3949	static int __init iommu_init_mempool(void)
				3950	{
				3951	int ret;
				3952	ret = iova_cache_get();
				3953	if (ret)
				3954	return ret;
				3955
				3956	ret = iommu_domain_cache_init();
				3957	if (ret)
				3958	goto domain_error;
				3959
				3960	ret = iommu_devinfo_cache_init();
				3961	if (!ret)
				3962	return ret;
				3963
				3964	kmem_cache_destroy(iommu_domain_cache);
				3965	domain_error:
				3966	iova_cache_put();
				3967
				3968	return -ENOMEM;
				3969	}
				3970
				3971	static void __init iommu_exit_mempool(void)
				3972	{
				3973	kmem_cache_destroy(iommu_devinfo_cache);
				3974	kmem_cache_destroy(iommu_domain_cache);
				3975	iova_cache_put();
				3976	}
				3977
				3978	static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
				3979	{
				3980	struct dmar_drhd_unit *drhd;
				3981	u32 vtbar;
				3982	int rc;
				3983
				3984	/* We know that this device on this chipset has its own IOMMU.
				3985	* If we find it under a different IOMMU, then the BIOS is lying
				3986	* to us. Hope that the IOMMU for this device is actually
				3987	* disabled, and it needs no translation...
				3988	*/
				3989	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
				3990	if (rc) {
				3991	/* "can't" happen */
				3992	dev_info(&pdev->dev, "failed to run vt-d quirk\n");
				3993	return;
				3994	}
				3995	vtbar &= 0xffff0000;
				3996
				3997	/* we know that the this iommu should be at offset 0xa000 from vtbar */
				3998	drhd = dmar_find_matched_drhd_unit(pdev);
				3999	if (WARN_TAINT_ONCE(!drhd \|\| drhd->reg_base_addr - vtbar != 0xa000,
				4000	TAINT_FIRMWARE_WORKAROUND,
				4001	"BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
				4002	pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
				4003	}
				4004	DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
				4005
				4006	static void __init init_no_remapping_devices(void)
				4007	{
				4008	struct dmar_drhd_unit *drhd;
				4009	struct device *dev;
				4010	int i;
				4011
				4012	for_each_drhd_unit(drhd) {
				4013	if (!drhd->include_all) {
				4014	for_each_active_dev_scope(drhd->devices,
				4015	drhd->devices_cnt, i, dev)
				4016	break;
				4017	/* ignore DMAR unit if no devices exist */
				4018	if (i == drhd->devices_cnt)
				4019	drhd->ignored = 1;
				4020	}
				4021	}
				4022
				4023	for_each_active_drhd_unit(drhd) {
				4024	if (drhd->include_all)
				4025	continue;
				4026
				4027	for_each_active_dev_scope(drhd->devices,
				4028	drhd->devices_cnt, i, dev)
				4029	if (!dev_is_pci(dev) \|\| !IS_GFX_DEVICE(to_pci_dev(dev)))
				4030	break;
				4031	if (i < drhd->devices_cnt)
				4032	continue;
				4033
				4034	/* This IOMMU has only gfx devices. Either bypass it or
				4035	set the gfx_mapped flag, as appropriate */
				4036	if (!dmar_map_gfx) {
				4037	drhd->ignored = 1;
				4038	for_each_active_dev_scope(drhd->devices,
				4039	drhd->devices_cnt, i, dev)
				4040	dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
				4041	}
				4042	}
				4043	}
				4044
				4045	#ifdef CONFIG_SUSPEND
				4046	static int init_iommu_hw(void)
				4047	{
				4048	struct dmar_drhd_unit *drhd;
				4049	struct intel_iommu *iommu = NULL;
				4050
				4051	for_each_active_iommu(iommu, drhd)
				4052	if (iommu->qi)
				4053	dmar_reenable_qi(iommu);
				4054
				4055	for_each_iommu(iommu, drhd) {
				4056	if (drhd->ignored) {
				4057	/*
				4058	* we always have to disable PMRs or DMA may fail on
				4059	* this device
				4060	*/
				4061	if (force_on)
				4062	iommu_disable_protect_mem_regions(iommu);
				4063	continue;
				4064	}
				4065
				4066	iommu_flush_write_buffer(iommu);
				4067
				4068	iommu_set_root_entry(iommu);
				4069
				4070	iommu->flush.flush_context(iommu, 0, 0, 0,
				4071	DMA_CCMD_GLOBAL_INVL);
				4072	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
				4073	iommu_enable_translation(iommu);
				4074	iommu_disable_protect_mem_regions(iommu);
				4075	}
				4076
				4077	return 0;
				4078	}
				4079
				4080	static void iommu_flush_all(void)
				4081	{
				4082	struct dmar_drhd_unit *drhd;
				4083	struct intel_iommu *iommu;
				4084
				4085	for_each_active_iommu(iommu, drhd) {
				4086	iommu->flush.flush_context(iommu, 0, 0, 0,
				4087	DMA_CCMD_GLOBAL_INVL);
				4088	iommu->flush.flush_iotlb(iommu, 0, 0, 0,
				4089	DMA_TLB_GLOBAL_FLUSH);
				4090	}
				4091	}
				4092
				4093	static int iommu_suspend(void)
				4094	{
				4095	struct dmar_drhd_unit *drhd;
				4096	struct intel_iommu *iommu = NULL;
				4097	unsigned long flag;
				4098
				4099	for_each_active_iommu(iommu, drhd) {
				4100	iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
				4101	GFP_ATOMIC);
				4102	if (!iommu->iommu_state)
				4103	goto nomem;
				4104	}
				4105
				4106	iommu_flush_all();
				4107
				4108	for_each_active_iommu(iommu, drhd) {
				4109	iommu_disable_translation(iommu);
				4110
				4111	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				4112
				4113	iommu->iommu_state[SR_DMAR_FECTL_REG] =
				4114	readl(iommu->reg + DMAR_FECTL_REG);
				4115	iommu->iommu_state[SR_DMAR_FEDATA_REG] =
				4116	readl(iommu->reg + DMAR_FEDATA_REG);
				4117	iommu->iommu_state[SR_DMAR_FEADDR_REG] =
				4118	readl(iommu->reg + DMAR_FEADDR_REG);
				4119	iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
				4120	readl(iommu->reg + DMAR_FEUADDR_REG);
				4121
				4122	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				4123	}
				4124	return 0;
				4125
				4126	nomem:
				4127	for_each_active_iommu(iommu, drhd)
				4128	kfree(iommu->iommu_state);
				4129
				4130	return -ENOMEM;
				4131	}
				4132
				4133	static void iommu_resume(void)
				4134	{
				4135	struct dmar_drhd_unit *drhd;
				4136	struct intel_iommu *iommu = NULL;
				4137	unsigned long flag;
				4138
				4139	if (init_iommu_hw()) {
				4140	if (force_on)
				4141	panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
				4142	else
				4143	WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
				4144	return;
				4145	}
				4146
				4147	for_each_active_iommu(iommu, drhd) {
				4148
				4149	raw_spin_lock_irqsave(&iommu->register_lock, flag);
				4150
				4151	writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
				4152	iommu->reg + DMAR_FECTL_REG);
				4153	writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
				4154	iommu->reg + DMAR_FEDATA_REG);
				4155	writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
				4156	iommu->reg + DMAR_FEADDR_REG);
				4157	writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
				4158	iommu->reg + DMAR_FEUADDR_REG);
				4159
				4160	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
				4161	}
				4162
				4163	for_each_active_iommu(iommu, drhd)
				4164	kfree(iommu->iommu_state);
				4165	}
				4166
				4167	static struct syscore_ops iommu_syscore_ops = {
				4168	.resume = iommu_resume,
				4169	.suspend = iommu_suspend,
				4170	};
				4171
				4172	static void __init init_iommu_pm_ops(void)
				4173	{
				4174	register_syscore_ops(&iommu_syscore_ops);
				4175	}
				4176
				4177	#else
				4178	static inline void init_iommu_pm_ops(void) {}
				4179	#endif /* CONFIG_PM */
				4180
				4181
				4182	int __init dmar_parse_one_rmrr(struct acpi_dmar_header header, void arg)
				4183	{
				4184	struct acpi_dmar_reserved_memory *rmrr;
				4185	int prot = DMA_PTE_READ\|DMA_PTE_WRITE;
				4186	struct dmar_rmrr_unit *rmrru;
				4187	size_t length;
				4188
				4189	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
				4190	if (!rmrru)
				4191	goto out;
				4192
				4193	rmrru->hdr = header;
				4194	rmrr = (struct acpi_dmar_reserved_memory *)header;
				4195	rmrru->base_address = rmrr->base_address;
				4196	rmrru->end_address = rmrr->end_address;
				4197
				4198	length = rmrr->end_address - rmrr->base_address + 1;
				4199	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
				4200	IOMMU_RESV_DIRECT);
				4201	if (!rmrru->resv)
				4202	goto free_rmrru;
				4203
				4204	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				4205	((void *)rmrr) + rmrr->header.length,
				4206	&rmrru->devices_cnt);
				4207	if (rmrru->devices_cnt && rmrru->devices == NULL)
				4208	goto free_all;
				4209
				4210	list_add(&rmrru->list, &dmar_rmrr_units);
				4211
				4212	return 0;
				4213	free_all:
				4214	kfree(rmrru->resv);
				4215	free_rmrru:
				4216	kfree(rmrru);
				4217	out:
				4218	return -ENOMEM;
				4219	}
				4220
				4221	static struct dmar_atsr_unit dmar_find_atsr(struct acpi_dmar_atsr atsr)
				4222	{
				4223	struct dmar_atsr_unit *atsru;
				4224	struct acpi_dmar_atsr *tmp;
				4225
				4226	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
				4227	tmp = (struct acpi_dmar_atsr *)atsru->hdr;
				4228	if (atsr->segment != tmp->segment)
				4229	continue;
				4230	if (atsr->header.length != tmp->header.length)
				4231	continue;
				4232	if (memcmp(atsr, tmp, atsr->header.length) == 0)
				4233	return atsru;
				4234	}
				4235
				4236	return NULL;
				4237	}
				4238
				4239	int dmar_parse_one_atsr(struct acpi_dmar_header hdr, void arg)
				4240	{
				4241	struct acpi_dmar_atsr *atsr;
				4242	struct dmar_atsr_unit *atsru;
				4243
				4244	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
				4245	return 0;
				4246
				4247	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
				4248	atsru = dmar_find_atsr(atsr);
				4249	if (atsru)
				4250	return 0;
				4251
				4252	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
				4253	if (!atsru)
				4254	return -ENOMEM;
				4255
				4256	/*
				4257	* If memory is allocated from slab by ACPI _DSM method, we need to
				4258	* copy the memory content because the memory buffer will be freed
				4259	* on return.
				4260	*/
				4261	atsru->hdr = (void *)(atsru + 1);
				4262	memcpy(atsru->hdr, hdr, hdr->length);
				4263	atsru->include_all = atsr->flags & 0x1;
				4264	if (!atsru->include_all) {
				4265	atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				4266	(void *)atsr + atsr->header.length,
				4267	&atsru->devices_cnt);
				4268	if (atsru->devices_cnt && atsru->devices == NULL) {
				4269	kfree(atsru);
				4270	return -ENOMEM;
				4271	}
				4272	}
				4273
				4274	list_add_rcu(&atsru->list, &dmar_atsr_units);
				4275
				4276	return 0;
				4277	}
				4278
				4279	static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
				4280	{
				4281	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
				4282	kfree(atsru);
				4283	}
				4284
				4285	int dmar_release_one_atsr(struct acpi_dmar_header hdr, void arg)
				4286	{
				4287	struct acpi_dmar_atsr *atsr;
				4288	struct dmar_atsr_unit *atsru;
				4289
				4290	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
				4291	atsru = dmar_find_atsr(atsr);
				4292	if (atsru) {
				4293	list_del_rcu(&atsru->list);
				4294	synchronize_rcu();
				4295	intel_iommu_free_atsr(atsru);
				4296	}
				4297
				4298	return 0;
				4299	}
				4300
				4301	int dmar_check_one_atsr(struct acpi_dmar_header hdr, void arg)
				4302	{
				4303	int i;
				4304	struct device *dev;
				4305	struct acpi_dmar_atsr *atsr;
				4306	struct dmar_atsr_unit *atsru;
				4307
				4308	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
				4309	atsru = dmar_find_atsr(atsr);
				4310	if (!atsru)
				4311	return 0;
				4312
				4313	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
				4314	for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
				4315	i, dev)
				4316	return -EBUSY;
				4317	}
				4318
				4319	return 0;
				4320	}
				4321
				4322	static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
				4323	{
				4324	int sp, ret = 0;
				4325	struct intel_iommu *iommu = dmaru->iommu;
				4326
				4327	if (g_iommus[iommu->seq_id])
				4328	return 0;
				4329
				4330	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
				4331	pr_warn("%s: Doesn't support hardware pass through.\n",
				4332	iommu->name);
				4333	return -ENXIO;
				4334	}
				4335	if (!ecap_sc_support(iommu->ecap) &&
				4336	domain_update_iommu_snooping(iommu)) {
				4337	pr_warn("%s: Doesn't support snooping.\n",
				4338	iommu->name);
				4339	return -ENXIO;
				4340	}
				4341	sp = domain_update_iommu_superpage(iommu) - 1;
				4342	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
				4343	pr_warn("%s: Doesn't support large page.\n",
				4344	iommu->name);
				4345	return -ENXIO;
				4346	}
				4347
				4348	/*
				4349	* Disable translation if already enabled prior to OS handover.
				4350	*/
				4351	if (iommu->gcmd & DMA_GCMD_TE)
				4352	iommu_disable_translation(iommu);
				4353
				4354	g_iommus[iommu->seq_id] = iommu;
				4355	ret = iommu_init_domains(iommu);
				4356	if (ret == 0)
				4357	ret = iommu_alloc_root_entry(iommu);
				4358	if (ret)
				4359	goto out;
				4360
				4361	#ifdef CONFIG_INTEL_IOMMU_SVM
				4362	if (pasid_enabled(iommu))
				4363	intel_svm_init(iommu);
				4364	#endif
				4365
				4366	if (dmaru->ignored) {
				4367	/*
				4368	* we always have to disable PMRs or DMA may fail on this device
				4369	*/
				4370	if (force_on)
				4371	iommu_disable_protect_mem_regions(iommu);
				4372	return 0;
				4373	}
				4374
				4375	intel_iommu_init_qi(iommu);
				4376	iommu_flush_write_buffer(iommu);
				4377
				4378	#ifdef CONFIG_INTEL_IOMMU_SVM
				4379	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
				4380	ret = intel_svm_enable_prq(iommu);
				4381	if (ret)
				4382	goto disable_iommu;
				4383	}
				4384	#endif
				4385	ret = dmar_set_interrupt(iommu);
				4386	if (ret)
				4387	goto disable_iommu;
				4388
				4389	iommu_set_root_entry(iommu);
				4390	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
				4391	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
				4392	iommu_enable_translation(iommu);
				4393
				4394	iommu_disable_protect_mem_regions(iommu);
				4395	return 0;
				4396
				4397	disable_iommu:
				4398	disable_dmar_iommu(iommu);
				4399	out:
				4400	free_dmar_iommu(iommu);
				4401	return ret;
				4402	}
				4403
				4404	int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
				4405	{
				4406	int ret = 0;
				4407	struct intel_iommu *iommu = dmaru->iommu;
				4408
				4409	if (!intel_iommu_enabled)
				4410	return 0;
				4411	if (iommu == NULL)
				4412	return -EINVAL;
				4413
				4414	if (insert) {
				4415	ret = intel_iommu_add(dmaru);
				4416	} else {
				4417	disable_dmar_iommu(iommu);
				4418	free_dmar_iommu(iommu);
				4419	}
				4420
				4421	return ret;
				4422	}
				4423
				4424	static void intel_iommu_free_dmars(void)
				4425	{
				4426	struct dmar_rmrr_unit rmrru, rmrr_n;
				4427	struct dmar_atsr_unit atsru, atsr_n;
				4428
				4429	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
				4430	list_del(&rmrru->list);
				4431	dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
				4432	kfree(rmrru->resv);
				4433	kfree(rmrru);
				4434	}
				4435
				4436	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
				4437	list_del(&atsru->list);
				4438	intel_iommu_free_atsr(atsru);
				4439	}
				4440	}
				4441
				4442	int dmar_find_matched_atsr_unit(struct pci_dev *dev)
				4443	{
				4444	int i, ret = 1;
				4445	struct pci_bus *bus;
				4446	struct pci_dev *bridge = NULL;
				4447	struct device *tmp;
				4448	struct acpi_dmar_atsr *atsr;
				4449	struct dmar_atsr_unit *atsru;
				4450
				4451	dev = pci_physfn(dev);
				4452	for (bus = dev->bus; bus; bus = bus->parent) {
				4453	bridge = bus->self;
				4454	/* If it's an integrated device, allow ATS */
				4455	if (!bridge)
				4456	return 1;
				4457	/* Connected via non-PCIe: no ATS */
				4458	if (!pci_is_pcie(bridge) \|\|
				4459	pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
				4460	return 0;
				4461	/* If we found the root port, look it up in the ATSR */
				4462	if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
				4463	break;
				4464	}
				4465
				4466	rcu_read_lock();
				4467	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
				4468	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
				4469	if (atsr->segment != pci_domain_nr(dev->bus))
				4470	continue;
				4471
				4472	for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
				4473	if (tmp == &bridge->dev)
				4474	goto out;
				4475
				4476	if (atsru->include_all)
				4477	goto out;
				4478	}
				4479	ret = 0;
				4480	out:
				4481	rcu_read_unlock();
				4482
				4483	return ret;
				4484	}
				4485
				4486	int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
				4487	{
				4488	int ret = 0;
				4489	struct dmar_rmrr_unit *rmrru;
				4490	struct dmar_atsr_unit *atsru;
				4491	struct acpi_dmar_atsr *atsr;
				4492	struct acpi_dmar_reserved_memory *rmrr;
				4493
				4494	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
				4495	return 0;
				4496
				4497	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
				4498	rmrr = container_of(rmrru->hdr,
				4499	struct acpi_dmar_reserved_memory, header);
				4500	if (info->event == BUS_NOTIFY_ADD_DEVICE) {
				4501	ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				4502	((void *)rmrr) + rmrr->header.length,
				4503	rmrr->segment, rmrru->devices,
				4504	rmrru->devices_cnt);
				4505	if(ret < 0)
				4506	return ret;
				4507	} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
				4508	dmar_remove_dev_scope(info, rmrr->segment,
				4509	rmrru->devices, rmrru->devices_cnt);
				4510	}
				4511	}
				4512
				4513	list_for_each_entry(atsru, &dmar_atsr_units, list) {
				4514	if (atsru->include_all)
				4515	continue;
				4516
				4517	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
				4518	if (info->event == BUS_NOTIFY_ADD_DEVICE) {
				4519	ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
				4520	(void *)atsr + atsr->header.length,
				4521	atsr->segment, atsru->devices,
				4522	atsru->devices_cnt);
				4523	if (ret > 0)
				4524	break;
				4525	else if(ret < 0)
				4526	return ret;
				4527	} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
				4528	if (dmar_remove_dev_scope(info, atsr->segment,
				4529	atsru->devices, atsru->devices_cnt))
				4530	break;
				4531	}
				4532	}
				4533
				4534	return 0;
				4535	}
				4536
				4537	/*
				4538	* Here we only respond to action of unbound device from driver.
				4539	*
				4540	* Added device is not attached to its DMAR domain here yet. That will happen
				4541	* when mapping the device to iova.
				4542	*/
				4543	static int device_notifier(struct notifier_block *nb,
				4544	unsigned long action, void *data)
				4545	{
				4546	struct device *dev = data;
				4547	struct dmar_domain *domain;
				4548
				4549	if (iommu_dummy(dev))
				4550	return 0;
				4551
				4552	if (action != BUS_NOTIFY_REMOVED_DEVICE)
				4553	return 0;
				4554
				4555	domain = find_domain(dev);
				4556	if (!domain)
				4557	return 0;
				4558
				4559	dmar_remove_one_dev_info(domain, dev);
				4560	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
				4561	domain_exit(domain);
				4562
				4563	return 0;
				4564	}
				4565
				4566	static struct notifier_block device_nb = {
				4567	.notifier_call = device_notifier,
				4568	};
				4569
				4570	static int intel_iommu_memory_notifier(struct notifier_block *nb,
				4571	unsigned long val, void *v)
				4572	{
				4573	struct memory_notify *mhp = v;
				4574	unsigned long long start, end;
				4575	unsigned long start_vpfn, last_vpfn;
				4576
				4577	switch (val) {
				4578	case MEM_GOING_ONLINE:
				4579	start = mhp->start_pfn << PAGE_SHIFT;
				4580	end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
				4581	if (iommu_domain_identity_map(si_domain, start, end)) {
				4582	pr_warn("Failed to build identity map for [%llx-%llx]\n",
				4583	start, end);
				4584	return NOTIFY_BAD;
				4585	}
				4586	break;
				4587
				4588	case MEM_OFFLINE:
				4589	case MEM_CANCEL_ONLINE:
				4590	start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
				4591	last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
				4592	while (start_vpfn <= last_vpfn) {
				4593	struct iova *iova;
				4594	struct dmar_drhd_unit *drhd;
				4595	struct intel_iommu *iommu;
				4596	struct page *freelist;
				4597
				4598	iova = find_iova(&si_domain->iovad, start_vpfn);
				4599	if (iova == NULL) {
				4600	pr_debug("Failed get IOVA for PFN %lx\n",
				4601	start_vpfn);
				4602	break;
				4603	}
				4604
				4605	iova = split_and_remove_iova(&si_domain->iovad, iova,
				4606	start_vpfn, last_vpfn);
				4607	if (iova == NULL) {
				4608	pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
				4609	start_vpfn, last_vpfn);
				4610	return NOTIFY_BAD;
				4611	}
				4612
				4613	freelist = domain_unmap(si_domain, iova->pfn_lo,
				4614	iova->pfn_hi);
				4615
				4616	rcu_read_lock();
				4617	for_each_active_iommu(iommu, drhd)
				4618	iommu_flush_iotlb_psi(iommu, si_domain,
				4619	iova->pfn_lo, iova_size(iova),
				4620	!freelist, 0);
				4621	rcu_read_unlock();
				4622	dma_free_pagelist(freelist);
				4623
				4624	start_vpfn = iova->pfn_hi + 1;
				4625	free_iova_mem(iova);
				4626	}
				4627	break;
				4628	}
				4629
				4630	return NOTIFY_OK;
				4631	}
				4632
				4633	static struct notifier_block intel_iommu_memory_nb = {
				4634	.notifier_call = intel_iommu_memory_notifier,
				4635	.priority = 0
				4636	};
				4637
				4638	static void free_all_cpu_cached_iovas(unsigned int cpu)
				4639	{
				4640	int i;
				4641
				4642	for (i = 0; i < g_num_of_iommus; i++) {
				4643	struct intel_iommu *iommu = g_iommus[i];
				4644	struct dmar_domain *domain;
				4645	int did;
				4646
				4647	if (!iommu)
				4648	continue;
				4649
				4650	for (did = 0; did < cap_ndoms(iommu->cap); did++) {
				4651	domain = get_iommu_domain(iommu, (u16)did);
				4652
				4653	if (!domain)
				4654	continue;
				4655	free_cpu_cached_iovas(cpu, &domain->iovad);
				4656	}
				4657	}
				4658	}
				4659
				4660	static int intel_iommu_cpu_dead(unsigned int cpu)
				4661	{
				4662	free_all_cpu_cached_iovas(cpu);
				4663	return 0;
				4664	}
				4665
				4666	static void intel_disable_iommus(void)
				4667	{
				4668	struct intel_iommu *iommu = NULL;
				4669	struct dmar_drhd_unit *drhd;
				4670
				4671	for_each_iommu(iommu, drhd)
				4672	iommu_disable_translation(iommu);
				4673	}
				4674
				4675	static inline struct intel_iommu dev_to_intel_iommu(struct device dev)
				4676	{
				4677	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
				4678
				4679	return container_of(iommu_dev, struct intel_iommu, iommu);
				4680	}
				4681
				4682	static ssize_t intel_iommu_show_version(struct device *dev,
				4683	struct device_attribute *attr,
				4684	char *buf)
				4685	{
				4686	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4687	u32 ver = readl(iommu->reg + DMAR_VER_REG);
				4688	return sprintf(buf, "%d:%d\n",
				4689	DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
				4690	}
				4691	static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
				4692
				4693	static ssize_t intel_iommu_show_address(struct device *dev,
				4694	struct device_attribute *attr,
				4695	char *buf)
				4696	{
				4697	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4698	return sprintf(buf, "%llx\n", iommu->reg_phys);
				4699	}
				4700	static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
				4701
				4702	static ssize_t intel_iommu_show_cap(struct device *dev,
				4703	struct device_attribute *attr,
				4704	char *buf)
				4705	{
				4706	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4707	return sprintf(buf, "%llx\n", iommu->cap);
				4708	}
				4709	static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
				4710
				4711	static ssize_t intel_iommu_show_ecap(struct device *dev,
				4712	struct device_attribute *attr,
				4713	char *buf)
				4714	{
				4715	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4716	return sprintf(buf, "%llx\n", iommu->ecap);
				4717	}
				4718	static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
				4719
				4720	static ssize_t intel_iommu_show_ndoms(struct device *dev,
				4721	struct device_attribute *attr,
				4722	char *buf)
				4723	{
				4724	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4725	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
				4726	}
				4727	static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
				4728
				4729	static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
				4730	struct device_attribute *attr,
				4731	char *buf)
				4732	{
				4733	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
				4734	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
				4735	cap_ndoms(iommu->cap)));
				4736	}
				4737	static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
				4738
				4739	static struct attribute *intel_iommu_attrs[] = {
				4740	&dev_attr_version.attr,
				4741	&dev_attr_address.attr,
				4742	&dev_attr_cap.attr,
				4743	&dev_attr_ecap.attr,
				4744	&dev_attr_domains_supported.attr,
				4745	&dev_attr_domains_used.attr,
				4746	NULL,
				4747	};
				4748
				4749	static struct attribute_group intel_iommu_group = {
				4750	.name = "intel-iommu",
				4751	.attrs = intel_iommu_attrs,
				4752	};
				4753
				4754	const struct attribute_group *intel_iommu_groups[] = {
				4755	&intel_iommu_group,
				4756	NULL,
				4757	};
				4758
				4759	int __init intel_iommu_init(void)
				4760	{
				4761	int ret = -ENODEV;
				4762	struct dmar_drhd_unit *drhd;
				4763	struct intel_iommu *iommu;
				4764
				4765	/* VT-d is required for a TXT/tboot launch, so enforce that */
				4766	force_on = tboot_force_iommu();
				4767
				4768	if (iommu_init_mempool()) {
				4769	if (force_on)
				4770	panic("tboot: Failed to initialize iommu memory\n");
				4771	return -ENOMEM;
				4772	}
				4773
				4774	down_write(&dmar_global_lock);
				4775	if (dmar_table_init()) {
				4776	if (force_on)
				4777	panic("tboot: Failed to initialize DMAR table\n");
				4778	goto out_free_dmar;
				4779	}
				4780
				4781	if (dmar_dev_scope_init() < 0) {
				4782	if (force_on)
				4783	panic("tboot: Failed to initialize DMAR device scope\n");
				4784	goto out_free_dmar;
				4785	}
				4786
				4787	up_write(&dmar_global_lock);
				4788
				4789	/*
				4790	* The bus notifier takes the dmar_global_lock, so lockdep will
				4791	* complain later when we register it under the lock.
				4792	*/
				4793	dmar_register_bus_notifier();
				4794
				4795	down_write(&dmar_global_lock);
				4796
				4797	if (no_iommu \|\| dmar_disabled) {
				4798	/*
				4799	* We exit the function here to ensure IOMMU's remapping and
				4800	* mempool aren't setup, which means that the IOMMU's PMRs
				4801	* won't be disabled via the call to init_dmars(). So disable
				4802	* it explicitly here. The PMRs were setup by tboot prior to
				4803	* calling SENTER, but the kernel is expected to reset/tear
				4804	* down the PMRs.
				4805	*/
				4806	if (intel_iommu_tboot_noforce) {
				4807	for_each_iommu(iommu, drhd)
				4808	iommu_disable_protect_mem_regions(iommu);
				4809	}
				4810
				4811	/*
				4812	* Make sure the IOMMUs are switched off, even when we
				4813	* boot into a kexec kernel and the previous kernel left
				4814	* them enabled
				4815	*/
				4816	intel_disable_iommus();
				4817	goto out_free_dmar;
				4818	}
				4819
				4820	if (list_empty(&dmar_rmrr_units))
				4821	pr_info("No RMRR found\n");
				4822
				4823	if (list_empty(&dmar_atsr_units))
				4824	pr_info("No ATSR found\n");
				4825
				4826	if (dmar_init_reserved_ranges()) {
				4827	if (force_on)
				4828	panic("tboot: Failed to reserve iommu ranges\n");
				4829	goto out_free_reserved_range;
				4830	}
				4831
				4832	if (dmar_map_gfx)
				4833	intel_iommu_gfx_mapped = 1;
				4834
				4835	init_no_remapping_devices();
				4836
				4837	ret = init_dmars();
				4838	if (ret) {
				4839	if (force_on)
				4840	panic("tboot: Failed to initialize DMARs\n");
				4841	pr_err("Initialization failed\n");
				4842	goto out_free_reserved_range;
				4843	}
				4844	up_write(&dmar_global_lock);
				4845	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
				4846
				4847	#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
				4848	swiotlb = 0;
				4849	#endif
				4850	dma_ops = &intel_dma_ops;
				4851
				4852	init_iommu_pm_ops();
				4853
				4854	for_each_active_iommu(iommu, drhd) {
				4855	iommu_device_sysfs_add(&iommu->iommu, NULL,
				4856	intel_iommu_groups,
				4857	"%s", iommu->name);
				4858	iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
				4859	iommu_device_register(&iommu->iommu);
				4860	}
				4861
				4862	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
				4863	bus_register_notifier(&pci_bus_type, &device_nb);
				4864	if (si_domain && !hw_pass_through)
				4865	register_memory_notifier(&intel_iommu_memory_nb);
				4866	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
				4867	intel_iommu_cpu_dead);
				4868	intel_iommu_enabled = 1;
				4869
				4870	return 0;
				4871
				4872	out_free_reserved_range:
				4873	put_iova_domain(&reserved_iova_list);
				4874	out_free_dmar:
				4875	intel_iommu_free_dmars();
				4876	up_write(&dmar_global_lock);
				4877	iommu_exit_mempool();
				4878	return ret;
				4879	}
				4880
				4881	static int domain_context_clear_one_cb(struct pci_dev pdev, u16 alias, void opaque)
				4882	{
				4883	struct intel_iommu *iommu = opaque;
				4884
				4885	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
				4886	return 0;
				4887	}
				4888
				4889	/*
				4890	* NB - intel-iommu lacks any sort of reference counting for the users of
				4891	* dependent devices. If multiple endpoints have intersecting dependent
				4892	* devices, unbinding the driver from any one of them will possibly leave
				4893	* the others unable to operate.
				4894	*/
				4895	static void domain_context_clear(struct intel_iommu iommu, struct device dev)
				4896	{
				4897	if (!iommu \|\| !dev \|\| !dev_is_pci(dev))
				4898	return;
				4899
				4900	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
				4901	}
				4902
				4903	static void __dmar_remove_one_dev_info(struct device_domain_info *info)
				4904	{
				4905	struct intel_iommu *iommu;
				4906	unsigned long flags;
				4907
				4908	assert_spin_locked(&device_domain_lock);
				4909
				4910	if (WARN_ON(!info))
				4911	return;
				4912
				4913	iommu = info->iommu;
				4914
				4915	if (info->dev) {
				4916	iommu_disable_dev_iotlb(info);
				4917	domain_context_clear(iommu, info->dev);
				4918	intel_pasid_free_table(info->dev);
				4919	}
				4920
				4921	unlink_domain_info(info);
				4922
				4923	spin_lock_irqsave(&iommu->lock, flags);
				4924	domain_detach_iommu(info->domain, iommu);
				4925	spin_unlock_irqrestore(&iommu->lock, flags);
				4926
				4927	free_devinfo_mem(info);
				4928	}
				4929
				4930	static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				4931	struct device *dev)
				4932	{
				4933	struct device_domain_info *info;
				4934	unsigned long flags;
				4935
				4936	spin_lock_irqsave(&device_domain_lock, flags);
				4937	info = dev->archdata.iommu;
				4938	__dmar_remove_one_dev_info(info);
				4939	spin_unlock_irqrestore(&device_domain_lock, flags);
				4940	}
				4941
				4942	static int md_domain_init(struct dmar_domain *domain, int guest_width)
				4943	{
				4944	int adjust_width;
				4945
				4946	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
				4947	domain_reserve_special_ranges(domain);
				4948
				4949	/* calculate AGAW */
				4950	domain->gaw = guest_width;
				4951	adjust_width = guestwidth_to_adjustwidth(guest_width);
				4952	domain->agaw = width_to_agaw(adjust_width);
				4953
				4954	domain->iommu_coherency = 0;
				4955	domain->iommu_snooping = 0;
				4956	domain->iommu_superpage = 0;
				4957	domain->max_addr = 0;
				4958
				4959	/* always allocate the top pgd */
				4960	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
				4961	if (!domain->pgd)
				4962	return -ENOMEM;
				4963	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
				4964	return 0;
				4965	}
				4966
				4967	static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
				4968	{
				4969	struct dmar_domain *dmar_domain;
				4970	struct iommu_domain *domain;
				4971
				4972	if (type != IOMMU_DOMAIN_UNMANAGED)
				4973	return NULL;
				4974
				4975	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
				4976	if (!dmar_domain) {
				4977	pr_err("Can't allocate dmar_domain\n");
				4978	return NULL;
				4979	}
				4980	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
				4981	pr_err("Domain initialization failed\n");
				4982	domain_exit(dmar_domain);
				4983	return NULL;
				4984	}
				4985	domain_update_iommu_cap(dmar_domain);
				4986
				4987	domain = &dmar_domain->domain;
				4988	domain->geometry.aperture_start = 0;
				4989	domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
				4990	domain->geometry.force_aperture = true;
				4991
				4992	return domain;
				4993	}
				4994
				4995	static void intel_iommu_domain_free(struct iommu_domain *domain)
				4996	{
				4997	domain_exit(to_dmar_domain(domain));
				4998	}
				4999
				5000	static int intel_iommu_attach_device(struct iommu_domain *domain,
				5001	struct device *dev)
				5002	{
				5003	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
				5004	struct intel_iommu *iommu;
				5005	int addr_width;
				5006	u8 bus, devfn;
				5007
				5008	if (device_is_rmrr_locked(dev)) {
				5009	dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
				5010	return -EPERM;
				5011	}
				5012
				5013	/* normally dev is not mapped */
				5014	if (unlikely(domain_context_mapped(dev))) {
				5015	struct dmar_domain *old_domain;
				5016
				5017	old_domain = find_domain(dev);
				5018	if (old_domain) {
				5019	rcu_read_lock();
				5020	dmar_remove_one_dev_info(old_domain, dev);
				5021	rcu_read_unlock();
				5022
				5023	if (!domain_type_is_vm_or_si(old_domain) &&
				5024	list_empty(&old_domain->devices))
				5025	domain_exit(old_domain);
				5026	}
				5027	}
				5028
				5029	iommu = device_to_iommu(dev, &bus, &devfn);
				5030	if (!iommu)
				5031	return -ENODEV;
				5032
				5033	/* check if this iommu agaw is sufficient for max mapped address */
				5034	addr_width = agaw_to_width(iommu->agaw);
				5035	if (addr_width > cap_mgaw(iommu->cap))
				5036	addr_width = cap_mgaw(iommu->cap);
				5037
				5038	if (dmar_domain->max_addr > (1LL << addr_width)) {
				5039	pr_err("%s: iommu width (%d) is not "
				5040	"sufficient for the mapped address (%llx)\n",
				5041	__func__, addr_width, dmar_domain->max_addr);
				5042	return -EFAULT;
				5043	}
				5044	dmar_domain->gaw = addr_width;
				5045
				5046	/*
				5047	* Knock out extra levels of page tables if necessary
				5048	*/
				5049	while (iommu->agaw < dmar_domain->agaw) {
				5050	struct dma_pte *pte;
				5051
				5052	pte = dmar_domain->pgd;
				5053	if (dma_pte_present(pte)) {
				5054	dmar_domain->pgd = (struct dma_pte *)
				5055	phys_to_virt(dma_pte_addr(pte));
				5056	free_pgtable_page(pte);
				5057	}
				5058	dmar_domain->agaw--;
				5059	}
				5060
				5061	return domain_add_dev_info(dmar_domain, dev);
				5062	}
				5063
				5064	static void intel_iommu_detach_device(struct iommu_domain *domain,
				5065	struct device *dev)
				5066	{
				5067	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
				5068	}
				5069
				5070	static int intel_iommu_map(struct iommu_domain *domain,
				5071	unsigned long iova, phys_addr_t hpa,
				5072	size_t size, int iommu_prot)
				5073	{
				5074	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
				5075	u64 max_addr;
				5076	int prot = 0;
				5077	int ret;
				5078
				5079	if (iommu_prot & IOMMU_READ)
				5080	prot \|= DMA_PTE_READ;
				5081	if (iommu_prot & IOMMU_WRITE)
				5082	prot \|= DMA_PTE_WRITE;
				5083	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
				5084	prot \|= DMA_PTE_SNP;
				5085
				5086	max_addr = iova + size;
				5087	if (dmar_domain->max_addr < max_addr) {
				5088	u64 end;
				5089
				5090	/* check if minimum agaw is sufficient for mapped address */
				5091	end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
				5092	if (end < max_addr) {
				5093	pr_err("%s: iommu width (%d) is not "
				5094	"sufficient for the mapped address (%llx)\n",
				5095	__func__, dmar_domain->gaw, max_addr);
				5096	return -EFAULT;
				5097	}
				5098	dmar_domain->max_addr = max_addr;
				5099	}
				5100	/* Round up size to next multiple of PAGE_SIZE, if it and
				5101	the low bits of hpa would take us onto the next page */
				5102	size = aligned_nrpages(hpa, size);
				5103	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				5104	hpa >> VTD_PAGE_SHIFT, size, prot);
				5105	return ret;
				5106	}
				5107
				5108	static size_t intel_iommu_unmap(struct iommu_domain *domain,
				5109	unsigned long iova, size_t size)
				5110	{
				5111	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
				5112	struct page *freelist = NULL;
				5113	unsigned long start_pfn, last_pfn;
				5114	unsigned int npages;
				5115	int iommu_id, level = 0;
				5116
				5117	/* Cope with horrid API which requires us to unmap more than the
				5118	size argument if it happens to be a large-page mapping. */
				5119	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
				5120
				5121	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
				5122	size = VTD_PAGE_SIZE << level_to_offset_bits(level);
				5123
				5124	start_pfn = iova >> VTD_PAGE_SHIFT;
				5125	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
				5126
				5127	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
				5128
				5129	npages = last_pfn - start_pfn + 1;
				5130
				5131	for_each_domain_iommu(iommu_id, dmar_domain)
				5132	iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				5133	start_pfn, npages, !freelist, 0);
				5134
				5135	dma_free_pagelist(freelist);
				5136
				5137	if (dmar_domain->max_addr == iova + size)
				5138	dmar_domain->max_addr = iova;
				5139
				5140	return size;
				5141	}
				5142
				5143	static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
				5144	dma_addr_t iova)
				5145	{
				5146	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
				5147	struct dma_pte *pte;
				5148	int level = 0;
				5149	u64 phys = 0;
				5150
				5151	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
				5152	if (pte)
				5153	phys = dma_pte_addr(pte);
				5154
				5155	return phys;
				5156	}
				5157
				5158	static bool intel_iommu_capable(enum iommu_cap cap)
				5159	{
				5160	if (cap == IOMMU_CAP_CACHE_COHERENCY)
				5161	return domain_update_iommu_snooping(NULL) == 1;
				5162	if (cap == IOMMU_CAP_INTR_REMAP)
				5163	return irq_remapping_enabled == 1;
				5164
				5165	return false;
				5166	}
				5167
				5168	static int intel_iommu_add_device(struct device *dev)
				5169	{
				5170	struct intel_iommu *iommu;
				5171	struct iommu_group *group;
				5172	u8 bus, devfn;
				5173
				5174	iommu = device_to_iommu(dev, &bus, &devfn);
				5175	if (!iommu)
				5176	return -ENODEV;
				5177
				5178	iommu_device_link(&iommu->iommu, dev);
				5179
				5180	group = iommu_group_get_for_dev(dev);
				5181
				5182	if (IS_ERR(group))
				5183	return PTR_ERR(group);
				5184
				5185	iommu_group_put(group);
				5186	return 0;
				5187	}
				5188
				5189	static void intel_iommu_remove_device(struct device *dev)
				5190	{
				5191	struct intel_iommu *iommu;
				5192	u8 bus, devfn;
				5193
				5194	iommu = device_to_iommu(dev, &bus, &devfn);
				5195	if (!iommu)
				5196	return;
				5197
				5198	iommu_group_remove_device(dev);
				5199
				5200	iommu_device_unlink(&iommu->iommu, dev);
				5201	}
				5202
				5203	static void intel_iommu_get_resv_regions(struct device *device,
				5204	struct list_head *head)
				5205	{
				5206	struct iommu_resv_region *reg;
				5207	struct dmar_rmrr_unit *rmrr;
				5208	struct device *i_dev;
				5209	int i;
				5210
				5211	rcu_read_lock();
				5212	for_each_rmrr_units(rmrr) {
				5213	for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
				5214	i, i_dev) {
				5215	if (i_dev != device)
				5216	continue;
				5217
				5218	list_add_tail(&rmrr->resv->list, head);
				5219	}
				5220	}
				5221	rcu_read_unlock();
				5222
				5223	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				5224	IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				5225	0, IOMMU_RESV_MSI);
				5226	if (!reg)
				5227	return;
				5228	list_add_tail(&reg->list, head);
				5229	}
				5230
				5231	static void intel_iommu_put_resv_regions(struct device *dev,
				5232	struct list_head *head)
				5233	{
				5234	struct iommu_resv_region entry, next;
				5235
				5236	list_for_each_entry_safe(entry, next, head, list) {
				5237	if (entry->type == IOMMU_RESV_MSI)
				5238	kfree(entry);
				5239	}
				5240	}
				5241
				5242	#ifdef CONFIG_INTEL_IOMMU_SVM
				5243	#define MAX_NR_PASID_BITS (20)
				5244	static inline unsigned long intel_iommu_get_pts(struct device *dev)
				5245	{
				5246	int pts, max_pasid;
				5247
				5248	max_pasid = intel_pasid_get_dev_max_id(dev);
				5249	pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
				5250	if (pts < 5)
				5251	return 0;
				5252
				5253	return pts - 5;
				5254	}
				5255
				5256	int intel_iommu_enable_pasid(struct intel_iommu iommu, struct intel_svm_dev sdev)
				5257	{
				5258	struct device_domain_info *info;
				5259	struct context_entry *context;
				5260	struct dmar_domain *domain;
				5261	unsigned long flags;
				5262	u64 ctx_lo;
				5263	int ret;
				5264
				5265	domain = get_valid_domain_for_dev(sdev->dev);
				5266	if (!domain)
				5267	return -EINVAL;
				5268
				5269	spin_lock_irqsave(&device_domain_lock, flags);
				5270	spin_lock(&iommu->lock);
				5271
				5272	ret = -EINVAL;
				5273	info = sdev->dev->archdata.iommu;
				5274	if (!info \|\| !info->pasid_supported)
				5275	goto out;
				5276
				5277	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
				5278	if (WARN_ON(!context))
				5279	goto out;
				5280
				5281	ctx_lo = context[0].lo;
				5282
				5283	sdev->did = domain->iommu_did[iommu->seq_id];
				5284	sdev->sid = PCI_DEVID(info->bus, info->devfn);
				5285
				5286	if (!(ctx_lo & CONTEXT_PASIDE)) {
				5287	if (iommu->pasid_state_table)
				5288	context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
				5289	context[1].lo = (u64)virt_to_phys(info->pasid_table->table) \|
				5290	intel_iommu_get_pts(sdev->dev);
				5291
				5292	wmb();
				5293	/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
				5294	* extended to permit requests-with-PASID if the PASIDE bit
				5295	* is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
				5296	* however, the PASIDE bit is ignored and requests-with-PASID
				5297	* are unconditionally blocked. Which makes less sense.
				5298	* So convert from CONTEXT_TT_PASS_THROUGH to one of the new
				5299	* "guest mode" translation types depending on whether ATS
				5300	* is available or not. Annoyingly, we can't use the new
				5301	* modes unless PASIDE is set. */
				5302	if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
				5303	ctx_lo &= ~CONTEXT_TT_MASK;
				5304	if (info->ats_supported)
				5305	ctx_lo \|= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
				5306	else
				5307	ctx_lo \|= CONTEXT_TT_PT_PASID << 2;
				5308	}
				5309	ctx_lo \|= CONTEXT_PASIDE;
				5310	if (iommu->pasid_state_table)
				5311	ctx_lo \|= CONTEXT_DINVE;
				5312	if (info->pri_supported)
				5313	ctx_lo \|= CONTEXT_PRS;
				5314	context[0].lo = ctx_lo;
				5315	wmb();
				5316	iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
				5317	DMA_CCMD_MASK_NOBIT,
				5318	DMA_CCMD_DEVICE_INVL);
				5319	}
				5320
				5321	/* Enable PASID support in the device, if it wasn't already */
				5322	if (!info->pasid_enabled)
				5323	iommu_enable_dev_iotlb(info);
				5324
				5325	if (info->ats_enabled) {
				5326	sdev->dev_iotlb = 1;
				5327	sdev->qdep = info->ats_qdep;
				5328	if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
				5329	sdev->qdep = 0;
				5330	}
				5331	ret = 0;
				5332
				5333	out:
				5334	spin_unlock(&iommu->lock);
				5335	spin_unlock_irqrestore(&device_domain_lock, flags);
				5336
				5337	return ret;
				5338	}
				5339
				5340	struct intel_iommu intel_svm_device_to_iommu(struct device dev)
				5341	{
				5342	struct intel_iommu *iommu;
				5343	u8 bus, devfn;
				5344
				5345	if (iommu_dummy(dev)) {
				5346	dev_warn(dev,
				5347	"No IOMMU translation for device; cannot enable SVM\n");
				5348	return NULL;
				5349	}
				5350
				5351	iommu = device_to_iommu(dev, &bus, &devfn);
				5352	if ((!iommu)) {
				5353	dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
				5354	return NULL;
				5355	}
				5356
				5357	return iommu;
				5358	}
				5359	#endif /* CONFIG_INTEL_IOMMU_SVM */
				5360
				5361	const struct iommu_ops intel_iommu_ops = {
				5362	.capable = intel_iommu_capable,
				5363	.domain_alloc = intel_iommu_domain_alloc,
				5364	.domain_free = intel_iommu_domain_free,
				5365	.attach_dev = intel_iommu_attach_device,
				5366	.detach_dev = intel_iommu_detach_device,
				5367	.map = intel_iommu_map,
				5368	.unmap = intel_iommu_unmap,
				5369	.iova_to_phys = intel_iommu_iova_to_phys,
				5370	.add_device = intel_iommu_add_device,
				5371	.remove_device = intel_iommu_remove_device,
				5372	.get_resv_regions = intel_iommu_get_resv_regions,
				5373	.put_resv_regions = intel_iommu_put_resv_regions,
				5374	.device_group = pci_device_group,
				5375	.pgsize_bitmap = INTEL_IOMMU_PGSIZES,
				5376	};
				5377
				5378	static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
				5379	{
				5380	/* G4x/GM45 integrated gfx dmar support is totally busted. */
				5381	pr_info("Disabling IOMMU for graphics on this chipset\n");
				5382	dmar_map_gfx = 0;
				5383	}
				5384
				5385	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
				5386	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
				5387	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
				5388	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
				5389	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
				5390	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
				5391	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
				5392
				5393	static void quirk_iommu_rwbf(struct pci_dev *dev)
				5394	{
				5395	/*
				5396	* Mobile 4 Series Chipset neglects to set RWBF capability,
				5397	* but needs it. Same seems to hold for the desktop versions.
				5398	*/
				5399	pr_info("Forcing write-buffer flush capability\n");
				5400	rwbf_quirk = 1;
				5401	}
				5402
				5403	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
				5404	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
				5405	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
				5406	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
				5407	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
				5408	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
				5409	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
				5410
				5411	#define GGC 0x52
				5412	#define GGC_MEMORY_SIZE_MASK (0xf << 8)
				5413	#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
				5414	#define GGC_MEMORY_SIZE_1M (0x1 << 8)
				5415	#define GGC_MEMORY_SIZE_2M (0x3 << 8)
				5416	#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
				5417	#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
				5418	#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
				5419	#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
				5420
				5421	static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
				5422	{
				5423	unsigned short ggc;
				5424
				5425	if (pci_read_config_word(dev, GGC, &ggc))
				5426	return;
				5427
				5428	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
				5429	pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
				5430	dmar_map_gfx = 0;
				5431	} else if (dmar_map_gfx) {
				5432	/* we have to ensure the gfx device is idle before we flush */
				5433	pr_info("Disabling batched IOTLB flush on Ironlake\n");
				5434	intel_iommu_strict = 1;
				5435	}
				5436	}
				5437	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
				5438	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
				5439	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
				5440	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
				5441
				5442	/* On Tylersburg chipsets, some BIOSes have been known to enable the
				5443	ISOCH DMAR unit for the Azalia sound device, but not give it any
				5444	TLB entries, which causes it to deadlock. Check for that. We do
				5445	this in a function called from init_dmars(), instead of in a PCI
				5446	quirk, because we don't want to print the obnoxious "BIOS broken"
				5447	message if VT-d is actually disabled.
				5448	*/
				5449	static void __init check_tylersburg_isoch(void)
				5450	{
				5451	struct pci_dev *pdev;
				5452	uint32_t vtisochctrl;
				5453
				5454	/* If there's no Azalia in the system anyway, forget it. */
				5455	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
				5456	if (!pdev)
				5457	return;
				5458	pci_dev_put(pdev);
				5459
				5460	/* System Management Registers. Might be hidden, in which case
				5461	we can't do the sanity check. But that's OK, because the
				5462	known-broken BIOSes _don't_ actually hide it, so far. */
				5463	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
				5464	if (!pdev)
				5465	return;
				5466
				5467	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
				5468	pci_dev_put(pdev);
				5469	return;
				5470	}
				5471
				5472	pci_dev_put(pdev);
				5473
				5474	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
				5475	if (vtisochctrl & 1)
				5476	return;
				5477
				5478	/* Drop all bits other than the number of TLB entries */
				5479	vtisochctrl &= 0x1c;
				5480
				5481	/* If we have the recommended number of TLB entries (16), fine. */
				5482	if (vtisochctrl == 0x10)
				5483	return;
				5484
				5485	/* Zero TLB entries? You get to ride the short bus to school. */
				5486	if (!vtisochctrl) {
				5487	WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
				5488	"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
				5489	dmi_get_system_info(DMI_BIOS_VENDOR),
				5490	dmi_get_system_info(DMI_BIOS_VERSION),
				5491	dmi_get_system_info(DMI_PRODUCT_VERSION));
				5492	iommu_identity_mapping \|= IDENTMAP_AZALIA;
				5493	return;
				5494	}
				5495
				5496	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
				5497	vtisochctrl);
				5498	}