blob: 28af276cff9f69e56220c9bbddf5ab2dbd27ca9b [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24#include <linux/init.h>
25#include <linux/bitmap.h>
26#include <linux/debugfs.h>
27#include <linux/export.h>
28#include <linux/slab.h>
29#include <linux/irq.h>
30#include <linux/interrupt.h>
31#include <linux/spinlock.h>
32#include <linux/pci.h>
33#include <linux/dmar.h>
34#include <linux/dma-mapping.h>
35#include <linux/mempool.h>
36#include <linux/timer.h>
37#include <linux/iova.h>
38#include <linux/iommu.h>
39#include <linux/intel-iommu.h>
40#include <linux/syscore_ops.h>
41#include <linux/tboot.h>
42#include <linux/dmi.h>
43#include <linux/pci-ats.h>
44#include <linux/memblock.h>
45#include <asm/cacheflush.h>
46#include <asm/iommu.h>
47
48#define ROOT_SIZE VTD_PAGE_SIZE
49#define CONTEXT_SIZE VTD_PAGE_SIZE
50
51#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55#define IOAPIC_RANGE_START (0xfee00000)
56#define IOAPIC_RANGE_END (0xfeefffff)
57#define IOVA_START_ADDR (0x1000)
58
59#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61#define MAX_AGAW_WIDTH 64
62
63#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
75
76/* page table handling */
77#define LEVEL_STRIDE (9)
78#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
79
80/*
81 * This bitmap is used to advertise the page sizes our hardware support
82 * to the IOMMU core, which will then use this information to split
83 * physically contiguous memory regions it is mapping into page sizes
84 * that we support.
85 *
86 * Traditionally the IOMMU core just handed us the mappings directly,
87 * after making sure the size is an order of a 4KiB page and that the
88 * mapping has natural alignment.
89 *
90 * To retain this behavior, we currently advertise that we support
91 * all page sizes that are an order of 4KiB.
92 *
93 * If at some point we'd like to utilize the IOMMU core's new behavior,
94 * we could change this to advertise the real page sizes we support.
95 */
96#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
97
/*
 * Convert an adjusted guest address width (AGAW) encoding to the number
 * of page-table levels: AGAW 0 corresponds to a 2-level table.
 */
static inline int agaw_to_level(int agaw)
{
	return 2 + agaw;
}
102
103static inline int agaw_to_width(int agaw)
104{
105 return 30 + agaw * LEVEL_STRIDE;
106}
107
108static inline int width_to_agaw(int width)
109{
110 return (width - 30) / LEVEL_STRIDE;
111}
112
113static inline unsigned int level_to_offset_bits(int level)
114{
115 return (level - 1) * LEVEL_STRIDE;
116}
117
118static inline int pfn_level_offset(unsigned long pfn, int level)
119{
120 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
121}
122
/* Mask keeping only the pfn bits at or above @level's offset. */
static inline unsigned long level_mask(int level)
{
	return ~((1UL << level_to_offset_bits(level)) - 1);
}
127
/* Number of 4KiB pages covered by one entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int nbits = level_to_offset_bits(level);

	return 1UL << nbits;
}
132
/* Round @pfn up to the next boundary of a @level-sized region. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long size = level_size(level);

	return (pfn + size - 1) & level_mask(level);
}
137
138static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
139{
140 return 1 << ((lvl - 1) * LEVEL_STRIDE);
141}
142
143/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
144 are never going to work. */
145static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
146{
147 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
148}
149
150static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
151{
152 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
153}
/* VT-d pfn of the first 4KiB page backing struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d pfn of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
162
163/* global iommu list, set NULL for ignored DMAR units */
164static struct intel_iommu **g_iommus;
165
166static void __init check_tylersburg_isoch(void);
167static int rwbf_quirk;
168
169/*
170 * set to 1 to panic kernel if can't successfully enable VT-d
171 * (used when kernel is launched w/ TXT)
172 */
173static int force_on = 0;
174
175/*
176 * 0: Present
177 * 1-11: Reserved
178 * 12-63: Context Ptr (12 - (haw-1))
179 * 64-127: Reserved
180 */
/*
 * Root-table entry, one per PCI bus number; layout described above.
 * Bit 0 of val is the present bit, bits 12-63 hold the physical
 * address of that bus's context table.
 */
struct root_entry {
	u64	val;	/* present bit + context-table pointer */
	u64	rsvd1;	/* reserved */
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186static inline bool root_present(struct root_entry *root)
187{
188 return (root->val & 1);
189}
190static inline void set_root_present(struct root_entry *root)
191{
192 root->val |= 1;
193}
/*
 * Install the page-aligned context-table address in @root.
 * NOTE(review): the address is ORed in rather than assigned, so this
 * assumes the address bits of the entry are currently clear.
 */
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}
198
199static inline struct context_entry *
200get_context_addr_from_root(struct root_entry *root)
201{
202 return (struct context_entry *)
203 (root_present(root)?phys_to_virt(
204 root->val & VTD_PAGE_MASK) :
205 NULL);
206}
207
208/*
209 * low 64 bits:
210 * 0: present
211 * 1: fault processing disable
212 * 2-3: translation type
213 * 12-63: address space root
214 * high 64 bits:
215 * 0-2: address width
216 * 3-6: aval
217 * 8-23: domain id
218 */
/* One context-table entry per (bus, devfn); bit layout described above. */
struct context_entry {
	u64 lo;		/* present, FPD, translation type, page-table root */
	u64 hi;		/* address width, aval bits, domain id */
};
223
224static inline bool context_present(struct context_entry *context)
225{
226 return (context->lo & 1);
227}
228static inline void context_set_present(struct context_entry *context)
229{
230 context->lo |= 1;
231}
232
233static inline void context_set_fault_enable(struct context_entry *context)
234{
235 context->lo &= (((u64)-1) << 2) | 1;
236}
237
238static inline void context_set_translation_type(struct context_entry *context,
239 unsigned long value)
240{
241 context->lo &= (((u64)-1) << 4) | 3;
242 context->lo |= (value & 3) << 2;
243}
244
245static inline void context_set_address_root(struct context_entry *context,
246 unsigned long value)
247{
248 context->lo |= value & VTD_PAGE_MASK;
249}
250
251static inline void context_set_address_width(struct context_entry *context,
252 unsigned long value)
253{
254 context->hi |= value & 7;
255}
256
257static inline void context_set_domain_id(struct context_entry *context,
258 unsigned long value)
259{
260 context->hi |= (value & ((1 << 16) - 1)) << 8;
261}
262
263static inline void context_clear_entry(struct context_entry *context)
264{
265 context->lo = 0;
266 context->hi = 0;
267}
268
269/*
270 * 0: readable
271 * 1: writable
272 * 2-6: reserved
273 * 7: super page
274 * 8-10: available
275 * 11: snoop behavior
 * 12-63: Host physical address
277 */
/* A page-table entry at any level; bit layout described above. */
struct dma_pte {
	u64 val;
};
281
282static inline void dma_clear_pte(struct dma_pte *pte)
283{
284 pte->val = 0;
285}
286
287static inline void dma_set_pte_readable(struct dma_pte *pte)
288{
289 pte->val |= DMA_PTE_READ;
290}
291
292static inline void dma_set_pte_writable(struct dma_pte *pte)
293{
294 pte->val |= DMA_PTE_WRITE;
295}
296
297static inline void dma_set_pte_snp(struct dma_pte *pte)
298{
299 pte->val |= DMA_PTE_SNP;
300}
301
302static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
303{
304 pte->val = (pte->val & ~3) | (prot & 3);
305}
306
/* Return the page-aligned physical address stored in @pte. */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read: a 32-bit kernel cannot
	 * load a u64 in one instruction, so use cmpxchg64 with equal
	 * old/new values as an atomic read. */
	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}
316
317static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
318{
319 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
320}
321
322static inline bool dma_pte_present(struct dma_pte *pte)
323{
324 return (pte->val & 3) != 0;
325}
326
327static inline bool dma_pte_superpage(struct dma_pte *pte)
328{
329 return (pte->val & (1 << 7));
330}
331
332static inline int first_pte_in_page(struct dma_pte *pte)
333{
334 return !((unsigned long)pte & ~VTD_PAGE_MASK);
335}
336
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 * 	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
343static struct dmar_domain *si_domain;
344static int hw_pass_through = 1;
345
346/* devices under the same p2p bridge are owned in one domain */
347#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
348
349/* domain represents a virtual machine, more than one devices
350 * across iommus may be owned in one domain, e.g. kvm guest.
351 */
352#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
353
/* si_domain contains multiple devices */
355#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
356
357/* define the limit of IOMMUs supported in each domain */
358#ifdef CONFIG_X86
359# define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
360#else
361# define IOMMU_UNITS_SUPPORTED 64
362#endif
363
/*
 * A DMA-remapping domain: one set of page tables plus the devices and
 * IOMMUs that share them.
 */
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
					/* bitmap of iommus this domain uses*/

	struct list_head devices; 	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
390
391/* PCI domain-device relationship */
/*
 * One entry per device attached to a domain; linked on both the
 * domain's devices list and the global device_domain_list, under
 * device_domain_lock.
 */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
402
403static void flush_unmaps_timeout(unsigned long data);
404
405DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
406
#define HIGH_WATER_MARK 250
/*
 * A batch of deferred IOVA frees awaiting an IOTLB flush (drained by
 * the unmap timer / flush_unmaps_timeout).
 */
struct deferred_flush_tables {
	int next;	/* next free slot in the arrays below */
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};
413
414static struct deferred_flush_tables *deferred_flush;
415
416/* bitmap for indexing intel_iommus */
417static int g_num_of_iommus;
418
419static DEFINE_SPINLOCK(async_umap_flush_lock);
420static LIST_HEAD(unmaps_to_do);
421
422static int timer_on;
423static long list_size;
424
425static void domain_remove_dev_info(struct dmar_domain *domain);
426
427#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
428int dmar_disabled = 0;
429#else
430int dmar_disabled = 1;
431#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432
433int intel_iommu_enabled = 0;
434EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435
436static int dmar_map_gfx = 1;
437static int dmar_forcedac;
438static int intel_iommu_strict;
439static int intel_iommu_superpage = 1;
440
441int intel_iommu_gfx_mapped;
442EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445static DEFINE_SPINLOCK(device_domain_lock);
446static LIST_HEAD(device_domain_list);
447
448static struct iommu_ops intel_iommu_ops;
449
/*
 * Parse the "intel_iommu=" kernel command-line option: a comma
 * separated list of flags (on, off, igfx_off, forcedac, strict,
 * sp_off).  Tokens are matched by prefix; unknown tokens are silently
 * skipped.  Always returns 0 for a non-NULL string.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable supported super page\n");
			intel_iommu_superpage = 0;
		}

		/* Advance past the current token and any commas. */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
486
487static struct kmem_cache *iommu_domain_cache;
488static struct kmem_cache *iommu_devinfo_cache;
489static struct kmem_cache *iommu_iova_cache;
490
491static inline void *alloc_pgtable_page(int node)
492{
493 struct page *page;
494 void *vaddr = NULL;
495
496 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497 if (page)
498 vaddr = page_address(page);
499 return vaddr;
500}
501
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
506
507static inline void *alloc_domain_mem(void)
508{
509 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
510}
511
512static void free_domain_mem(void *vaddr)
513{
514 kmem_cache_free(iommu_domain_cache, vaddr);
515}
516
517static inline void * alloc_devinfo_mem(void)
518{
519 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
520}
521
522static inline void free_devinfo_mem(void *vaddr)
523{
524 kmem_cache_free(iommu_devinfo_cache, vaddr);
525}
526
527struct iova *alloc_iova_mem(void)
528{
529 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
530}
531
532void free_iova_mem(struct iova *iova)
533{
534 kmem_cache_free(iommu_iova_cache, iova);
535}
536
537
538static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
539{
540 unsigned long sagaw;
541 int agaw = -1;
542
543 sagaw = cap_sagaw(iommu->cap);
544 for (agaw = width_to_agaw(max_gaw);
545 agaw >= 0; agaw--) {
546 if (test_bit(agaw, &sagaw))
547 break;
548 }
549
550 return agaw;
551}
552
553/*
554 * Calculate max SAGAW for each iommu.
555 */
556int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
557{
558 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
559}
560
561/*
562 * calculate agaw for each iommu.
563 * "SAGAW" may be different across iommus, use a default agaw, and
564 * get a supported less agaw for iommus that don't support the default agaw.
565 */
566int iommu_calculate_agaw(struct intel_iommu *iommu)
567{
568 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
569}
570
/* This function only returns a single iommu in a domain */
572static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
573{
574 int iommu_id;
575
576 /* si_domain and vm domain should not get here. */
577 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
579
580 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582 return NULL;
583
584 return g_iommus[iommu_id];
585}
586
/*
 * Recompute domain->iommu_coherency: 1 only when at least one IOMMU is
 * attached and every attached IOMMU reports coherent page-walks (ecap
 * C bit), in which case page-table updates need no CPU cache flushing
 * (see domain_flush_cache()).
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	int i;

	i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);

	/* Start at 1 iff the domain has at least one IOMMU attached. */
	domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;

	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
}
602
/*
 * Recompute domain->iommu_snooping: 1 only if every IOMMU attached to
 * this domain supports Snoop Control (ecap SC bit).
 */
static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
	int i;

	domain->iommu_snooping = 1;

	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_sc_support(g_iommus[i]->ecap)) {
			domain->iommu_snooping = 0;
			break;
		}
	}
}
616
617static void domain_update_iommu_superpage(struct dmar_domain *domain)
618{
619 struct dmar_drhd_unit *drhd;
620 struct intel_iommu *iommu = NULL;
621 int mask = 0xf;
622
623 if (!intel_iommu_superpage) {
624 domain->iommu_superpage = 0;
625 return;
626 }
627
628 /* set iommu_superpage to the smallest common denominator */
629 for_each_active_iommu(iommu, drhd) {
630 mask &= cap_super_page_val(iommu->cap);
631 if (!mask) {
632 break;
633 }
634 }
635 domain->iommu_superpage = fls(mask);
636}
637
/*
 * Recompute every per-domain capability bit; some capabilities may be
 * different across the iommus attached to the domain.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
	domain_update_iommu_superpage(domain);
}
645
/*
 * Find the IOMMU (DRHD unit) responsible for the PCI device at
 * @segment:@bus:@devfn.  A unit claims the device when the device is
 * listed in the unit's scope, when the device sits behind a listed
 * bridge whose bus range covers @bus, or when the unit is the
 * catch-all (include_all) one.  Returns NULL if no unit matches.
 */
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		if (segment != drhd->segment)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++) {
			/* Exact match on a scoped device... */
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;
			/* ...or @bus lies within a scoped bridge's
			 * secondary..subordinate bus range. */
			if (drhd->devices[i] &&
			    drhd->devices[i]->subordinate &&
			    drhd->devices[i]->subordinate->number <= bus &&
			    drhd->devices[i]->subordinate->subordinate >= bus)
				return drhd->iommu;
		}

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}
675
676static void domain_flush_cache(struct dmar_domain *domain,
677 void *addr, int size)
678{
679 if (!domain->iommu_coherency)
680 clflush_cache_range(addr, size);
681}
682
/*
 * Get (and allocate if necessary) the context entry for (@bus, @devfn)
 * on @iommu.  Returns NULL only if allocating a new context table
 * fails.  iommu->lock serialises the allocate-and-publish sequence.
 */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		/* First use of this bus: allocate its context table. */
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		/* Install the table address before setting the present
		 * bit, then flush the root entry for the hardware. */
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
711
712static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
713{
714 struct root_entry *root;
715 struct context_entry *context;
716 int ret;
717 unsigned long flags;
718
719 spin_lock_irqsave(&iommu->lock, flags);
720 root = &iommu->root_entry[bus];
721 context = get_context_addr_from_root(root);
722 if (!context) {
723 ret = 0;
724 goto out;
725 }
726 ret = context_present(&context[devfn]);
727out:
728 spin_unlock_irqrestore(&iommu->lock, flags);
729 return ret;
730}
731
732static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
733{
734 struct root_entry *root;
735 struct context_entry *context;
736 unsigned long flags;
737
738 spin_lock_irqsave(&iommu->lock, flags);
739 root = &iommu->root_entry[bus];
740 context = get_context_addr_from_root(root);
741 if (context) {
742 context_clear_entry(&context[devfn]);
743 __iommu_flush_cache(iommu, &context[devfn], \
744 sizeof(*context));
745 }
746 spin_unlock_irqrestore(&iommu->lock, flags);
747}
748
749static void free_context_table(struct intel_iommu *iommu)
750{
751 struct root_entry *root;
752 int i;
753 unsigned long flags;
754 struct context_entry *context;
755
756 spin_lock_irqsave(&iommu->lock, flags);
757 if (!iommu->root_entry) {
758 goto out;
759 }
760 for (i = 0; i < ROOT_ENTRY_NR; i++) {
761 root = &iommu->root_entry[i];
762 context = get_context_addr_from_root(root);
763 if (context)
764 free_pgtable_page(context);
765 }
766 free_pgtable_page(iommu->root_entry);
767 iommu->root_entry = NULL;
768out:
769 spin_unlock_irqrestore(&iommu->lock, flags);
770}
771
/*
 * Walk @domain's page table to the pte covering @pfn at @target_level,
 * allocating intermediate table pages as needed.  @target_level == 0
 * means "deepest existing mapping": the walk stops at the first
 * superpage or non-present entry.  Returns NULL if @pfn is beyond the
 * domain's address width or an allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int target_level)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (addr_width < BITS_PER_LONG && pfn >> addr_width)
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (level > 0) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			/* cmpxchg so that a racing walker installing the
			 * same intermediate level does not leak a page. */
			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			} else {
				/* NOTE(review): result unused; presumably kept
				 * for its atomic-read side effect — confirm. */
				dma_pte_addr(pte);
				domain_flush_cache(domain, pte, sizeof(*pte));
			}
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	return pte;
}
822
823
/*
 * Return the pte covering @pfn at exactly @level, or NULL if a
 * non-present entry terminates the walk first.  When the walk ends
 * early on a non-present entry or returns a large (super) page pte,
 * *@large_page is set to the level where that happened.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (pte->val & DMA_PTE_LARGE_PAGE) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
855
/*
 * Clear leaf ptes for the pfn range [@start_pfn, @last_pfn]; the
 * caller must flush the IOTLB afterwards.  Returns the page order
 * derived from the last (super)page level encountered.
 */
static int dma_pte_clear_range(struct dmar_domain *domain,
			       unsigned long start_pfn,
			       unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;
	int order;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Nothing mapped here: skip to the next region
			 * at the level where the walk stopped. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);

	/* 9 index bits per level: order of the last page size cleared. */
	order = (large_page - 1) * 9;
	return order;
}
892
/*
 * Recursively free page-table pages at @level beneath @pte whose range
 * lies entirely within [@start_pfn, @last_pfn].  Leaf ptes must already
 * have been cleared (dma_pte_clear_range()).
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       struct dma_pte *pte, unsigned long pfn,
			       unsigned long start_pfn, unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		/* Nothing below a non-present entry or a superpage. */
		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level - 1);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2)
			dma_pte_free_level(domain, level - 1, level_pte,
					   level_pfn, start_pfn, last_pfn);

		/* If range covers entire pagetable, free it */
		if (!(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
925
/*
 * Free the page-table pages covering [@start_pfn, @last_pfn].  Leaf
 * ptes should already be cleared; if the range spans the whole domain,
 * the pgd itself is freed too.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw),
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
947
948/* iommu handling */
949static int iommu_alloc_root_entry(struct intel_iommu *iommu)
950{
951 struct root_entry *root;
952 unsigned long flags;
953
954 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
955 if (!root)
956 return -ENOMEM;
957
958 __iommu_flush_cache(iommu, root, ROOT_SIZE);
959
960 spin_lock_irqsave(&iommu->lock, flags);
961 iommu->root_entry = root;
962 spin_unlock_irqrestore(&iommu->lock, flags);
963
964 return 0;
965}
966
/*
 * Program @iommu's root-table address register and issue the Set Root
 * Table Pointer command, busy-waiting until the status register
 * acknowledges it.  register_lock guards the MMIO command sequence.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 sts;
	unsigned long flag;

	addr = iommu->root_entry;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
986
/*
 * Flush @iommu's internal write buffer.  A no-op unless the hardware
 * requires it (cap RWBF) or the chipset has the rwbf quirk.  Busy-waits
 * for the flush to complete.
 */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1004
/*
 * Issue a context-cache invalidation via the register interface.
 * @type selects global, domain-selective or device-selective
 * granularity (@did/@source_id/@function_mask qualify the latter two).
 * Busy-waits for the hardware to clear the ICC bit.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask,
	u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1038
/*
 * Issue an IOTLB invalidation via the register interface.  @type
 * selects global, domain-selective (DSI) or page-selective (PSI)
 * granularity; for PSI, @addr/@size_order describe the naturally
 * aligned region.  Busy-waits for completion and reports if the
 * hardware invalidated at a coarser granularity than requested.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1095
1096static struct device_domain_info *iommu_support_dev_iotlb(
1097 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1098{
1099 int found = 0;
1100 unsigned long flags;
1101 struct device_domain_info *info;
1102 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1103
1104 if (!ecap_dev_iotlb_support(iommu->ecap))
1105 return NULL;
1106
1107 if (!iommu->qi)
1108 return NULL;
1109
1110 spin_lock_irqsave(&device_domain_lock, flags);
1111 list_for_each_entry(info, &domain->devices, link)
1112 if (info->bus == bus && info->devfn == devfn) {
1113 found = 1;
1114 break;
1115 }
1116 spin_unlock_irqrestore(&device_domain_lock, flags);
1117
1118 if (!found || !info->dev)
1119 return NULL;
1120
1121 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1122 return NULL;
1123
1124 if (!dmar_find_matched_atsr_unit(info->dev))
1125 return NULL;
1126
1127 info->iommu = iommu;
1128
1129 return info;
1130}
1131
1132static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1133{
1134 if (!info)
1135 return;
1136
1137 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1138}
1139
1140static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1141{
1142 if (!info->dev || !pci_ats_enabled(info->dev))
1143 return;
1144
1145 pci_disable_ats(info->dev);
1146}
1147
/*
 * Tell every ATS-enabled device attached to @domain to invalidate its
 * device IOTLB for the region described by @addr and @mask (mask is the
 * invalidation size as a power-of-two page count).
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		/* only devices with ATS enabled hold a device IOTLB */
		if (!info->dev || !pci_ats_enabled(info->dev))
			continue;

		/* source-id is bus:devfn; queue depth from the ATS cap */
		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(info->dev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1166
/*
 * Page-selective IOTLB invalidation of @pages pages starting at @pfn in
 * domain @did, falling back to a domain-selective flush when PSI is not
 * supported or the range is larger than hardware allows.  @map is
 * non-zero when the flush is for a non-present -> present change, in
 * which case the device IOTLBs need no flushing.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int map)
{
	/* PSI needs a power-of-two page count; round up and take log2 */
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1195
/*
 * Clear the Enable Protected Memory bit in the PMEN register so DMA to
 * the BIOS-protected memory regions is no longer blocked, then wait for
 * hardware to report the Protected Region Status bit cleared.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1212
/*
 * Set the Translation Enable bit in the global command register and
 * spin until the status register confirms translation is on.
 * Always returns 0.
 */
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* gcmd caches the software view of the global command register */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}
1229
/*
 * Clear the Translation Enable bit in the global command register and
 * spin until the status register confirms translation is off.
 * Always returns 0.
 */
static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
1246
1247
1248static int iommu_init_domains(struct intel_iommu *iommu)
1249{
1250 unsigned long ndomains;
1251 unsigned long nlongs;
1252
1253 ndomains = cap_ndoms(iommu->cap);
1254 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1255 ndomains);
1256 nlongs = BITS_TO_LONGS(ndomains);
1257
1258 spin_lock_init(&iommu->lock);
1259
1260 /* TBD: there might be 64K domains,
1261 * consider other allocation for future chip
1262 */
1263 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1264 if (!iommu->domain_ids) {
1265 printk(KERN_ERR "Allocating domain id array failed\n");
1266 return -ENOMEM;
1267 }
1268 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1269 GFP_KERNEL);
1270 if (!iommu->domains) {
1271 printk(KERN_ERR "Allocating domain array failed\n");
1272 return -ENOMEM;
1273 }
1274
1275 /*
1276 * if Caching mode is set, then invalid translations are tagged
1277 * with domainid 0. Hence we need to pre-allocate it.
1278 */
1279 if (cap_caching_mode(iommu->cap))
1280 set_bit(0, iommu->domain_ids);
1281 return 0;
1282}
1283
1284
1285static void domain_exit(struct dmar_domain *domain);
1286static void vm_domain_exit(struct dmar_domain *domain);
1287
/*
 * Tear down all software state for @iommu: drop its reference on every
 * domain holding an id on it (destroying any domain for which this was
 * the last IOMMU), disable translation, release the fault IRQ, free the
 * domain tables and context table, and - when this was the last live
 * IOMMU - the global g_iommus array as well.
 */
void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			/* last IOMMU reference gone: destroy the domain */
			if (--domain->iommu_count == 0) {
				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
					vm_domain_exit(domain);
				else
					domain_exit(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		irq_set_handler_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}

	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}
1337
1338static struct dmar_domain *alloc_domain(void)
1339{
1340 struct dmar_domain *domain;
1341
1342 domain = alloc_domain_mem();
1343 if (!domain)
1344 return NULL;
1345
1346 domain->nid = -1;
1347 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1348 domain->flags = 0;
1349
1350 return domain;
1351}
1352
/*
 * Attach @domain to @iommu: claim a free domain id on that IOMMU, store
 * the domain in the IOMMU's domain table and mark the IOMMU in the
 * domain's bitmap.  Returns 0 on success, -ENOMEM when the IOMMU has no
 * free domain ids left.
 */
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	unsigned long flags;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);

	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return -ENOMEM;
	}

	domain->id = num;
	set_bit(num, iommu->domain_ids);
	set_bit(iommu->seq_id, domain->iommu_bmp);
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
1379
/*
 * Detach @domain from @iommu: release the domain id the domain holds on
 * that IOMMU (if any) and clear the IOMMU from the domain's bitmap.
 * A no-op when the domain has no id on this IOMMU.
 */
static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;
	int found = 0;

	spin_lock_irqsave(&iommu->lock, flags);
	ndomains = cap_ndoms(iommu->cap);
	for_each_set_bit(num, iommu->domain_ids, ndomains) {
		if (iommu->domains[num] == domain) {
			found = 1;
			break;
		}
	}

	if (found) {
		clear_bit(num, iommu->domain_ids);
		clear_bit(iommu->seq_id, domain->iommu_bmp);
		iommu->domains[num] = NULL;
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
1403
/* IOVA ranges that must never be handed out for DMA (IOAPIC, PCI MMIO) */
static struct iova_domain reserved_iova_list;
/* lockdep class for reserved_iova_list's rbtree lock (set at init time) */
static struct lock_class_key reserved_rbtree_key;
1406
1407static int dmar_init_reserved_ranges(void)
1408{
1409 struct pci_dev *pdev = NULL;
1410 struct iova *iova;
1411 int i;
1412
1413 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1414
1415 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1416 &reserved_rbtree_key);
1417
1418 /* IOAPIC ranges shouldn't be accessed by DMA */
1419 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1420 IOVA_PFN(IOAPIC_RANGE_END));
1421 if (!iova) {
1422 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1423 return -ENODEV;
1424 }
1425
1426 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1427 for_each_pci_dev(pdev) {
1428 struct resource *r;
1429
1430 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1431 r = &pdev->resource[i];
1432 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1433 continue;
1434 iova = reserve_iova(&reserved_iova_list,
1435 IOVA_PFN(r->start),
1436 IOVA_PFN(r->end));
1437 if (!iova) {
1438 printk(KERN_ERR "Reserve iova failed\n");
1439 return -ENODEV;
1440 }
1441 }
1442 }
1443 return 0;
1444}
1445
/* Copy the global reserved ranges (IOAPIC, PCI MMIO) into @domain's iovad */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1450
/*
 * Round a guest address width up to the next width the VT-d page-table
 * layout can express: 12 bits of page offset plus a whole number of
 * 9-bit page-table levels, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int remainder = (gaw - 12) % 9;
	int agaw = (remainder == 0) ? gaw : gaw + 9 - remainder;

	return (agaw > 64) ? 64 : agaw;
}
1464
/*
 * Initialize @domain for a guest address width of @guest_width bits:
 * set up its iova allocator, pick a hardware-supported adjusted guest
 * address width (AGAW), record the IOMMU's coherency/snooping/superpage
 * capabilities and allocate the top-level page directory.  Returns 0 on
 * success, -ENODEV when no supported AGAW exists, -ENOMEM when the page
 * directory cannot be allocated.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	/* highest superpage level the hardware supports (0 = none) */
	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	domain->iommu_count = 1;
	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1515
/*
 * Destroy @domain: remove its device info entries, release its iovas,
 * clear and free its page tables, detach it from every IOMMU it is
 * attached to, and free the domain structure itself.
 */
static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);

	free_domain_mem(domain);
}
1545
/*
 * Program the context entry on the IOMMU covering (@segment,
 * @bus:@devfn) so that DMA from that requester id is translated through
 * @domain, with translation type @translation (pass-through or
 * multi-level; upgraded to dev-IOTLB when the device supports ATS).
 * Returns 0 when the entry was installed or was already present,
 * negative errno on failure.
 */
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				      u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	iommu = device_to_iommu(segment, bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	/* an already-present context entry is left untouched */
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	/* VM and static-identity domains use a per-IOMMU domain id */
	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	/* prefer dev-IOTLB translation when the device can do ATS */
	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If the
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	/* account this IOMMU in the domain, refresh cached capabilities */
	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
1675
/*
 * Set up context entries for @pdev and for every bridge between it and
 * its upstream PCIe bridge, so DMA from any alias of the device is
 * routed to @domain.  Returns 0 on success, the first failing mapping's
 * error otherwise.
 */
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
			int translation)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
					 pdev->bus->number, pdev->devfn,
					 translation);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain,
						 pci_domain_nr(parent->bus),
						 parent->bus->number,
						 parent->devfn, translation);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain,
					pci_domain_nr(tmp->subordinate),
					tmp->subordinate->number, 0,
					translation);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
						  pci_domain_nr(tmp->bus),
						  tmp->bus->number,
						  tmp->devfn,
						  translation);
}
1716
/*
 * Check whether @pdev - and every bridge between it and its upstream
 * PCIe bridge - already has a context entry programmed.  Returns
 * non-zero when fully mapped, 0 when some entry is missing, -ENODEV
 * when no IOMMU covers the device.
 */
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}
1751
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	/* keep only the offset within the MM page, round the span up */
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
1759
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	/* level 1 = 4KiB pages; each extra level multiplies by 2^VTD_STRIDE_SHIFT */
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		/* not enough pages left to fill a superpage at this level */
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
1787
/*
 * Fill @domain's page tables for @nr_pages VT-d pages starting at
 * @iov_pfn, either from a scatterlist (@sg != NULL, @phys_pfn ignored)
 * or from a contiguous physical range starting at @phys_pfn.  Uses
 * superpages where hardware support and alignment allow.  Returns 0 on
 * success, -EINVAL for a prot without read or write permission,
 * -ENOMEM when a page-table page cannot be allocated.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* contiguous case: treat the whole request as one "segment" */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		/* advance to the next scatterlist segment */
		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				pteval |= DMA_PTE_LARGE_PAGE;
				/* Ensure that old small page tables are removed to make room
				   for superpage, if they exist. */
				dma_pte_clear_range(domain, iov_pfn,
						    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
				dma_pte_free_pagetable(domain, iov_pfn,
						       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: something else mapped this iova */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
1892
/* Map a scatterlist into @domain at @iov_pfn (see __domain_mapping). */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
1899
/* Map a contiguous pfn range into @domain at @iov_pfn (see __domain_mapping). */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
1906
/*
 * Clear the context entry for @bus:@devfn on @iommu and globally flush
 * the context cache and IOTLB so no stale translations remain.
 * A NULL @iommu is a no-op.
 */
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1917
/*
 * Unlink every device from @domain: clear each device's archdata
 * cookie, disable its device IOTLB, tear down its context entry and
 * free its info structure.  The device_domain_lock is dropped while
 * tearing down each device (teardown issues cache/IOTLB invalidations)
 * and re-taken before touching the list again.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1943
1944/*
1945 * find_domain
1946 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1947 */
1948static struct dmar_domain *
1949find_domain(struct pci_dev *pdev)
1950{
1951 struct device_domain_info *info;
1952
1953 /* No lock here, assumes no domain exit in normal case */
1954 info = pdev->dev.archdata.iommu;
1955 if (info)
1956 return info->domain;
1957 return NULL;
1958}
1959
1960/* domain is initialized */
1961static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1962{
1963 struct dmar_domain *domain, *found = NULL;
1964 struct intel_iommu *iommu;
1965 struct dmar_drhd_unit *drhd;
1966 struct device_domain_info *info, *tmp;
1967 struct pci_dev *dev_tmp;
1968 unsigned long flags;
1969 int bus = 0, devfn = 0;
1970 int segment;
1971 int ret;
1972
1973 domain = find_domain(pdev);
1974 if (domain)
1975 return domain;
1976
1977 segment = pci_domain_nr(pdev->bus);
1978
1979 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1980 if (dev_tmp) {
1981 if (pci_is_pcie(dev_tmp)) {
1982 bus = dev_tmp->subordinate->number;
1983 devfn = 0;
1984 } else {
1985 bus = dev_tmp->bus->number;
1986 devfn = dev_tmp->devfn;
1987 }
1988 spin_lock_irqsave(&device_domain_lock, flags);
1989 list_for_each_entry(info, &device_domain_list, global) {
1990 if (info->segment == segment &&
1991 info->bus == bus && info->devfn == devfn) {
1992 found = info->domain;
1993 break;
1994 }
1995 }
1996 spin_unlock_irqrestore(&device_domain_lock, flags);
1997 /* pcie-pci bridge already has a domain, uses it */
1998 if (found) {
1999 domain = found;
2000 goto found_domain;
2001 }
2002 }
2003
2004 domain = alloc_domain();
2005 if (!domain)
2006 goto error;
2007
2008 /* Allocate new domain for the device */
2009 drhd = dmar_find_matched_drhd_unit(pdev);
2010 if (!drhd) {
2011 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2012 pci_name(pdev));
2013 return NULL;
2014 }
2015 iommu = drhd->iommu;
2016
2017 ret = iommu_attach_domain(domain, iommu);
2018 if (ret) {
2019 free_domain_mem(domain);
2020 goto error;
2021 }
2022
2023 if (domain_init(domain, gaw)) {
2024 domain_exit(domain);
2025 goto error;
2026 }
2027
2028 /* register pcie-to-pci device */
2029 if (dev_tmp) {
2030 info = alloc_devinfo_mem();
2031 if (!info) {
2032 domain_exit(domain);
2033 goto error;
2034 }
2035 info->segment = segment;
2036 info->bus = bus;
2037 info->devfn = devfn;
2038 info->dev = NULL;
2039 info->domain = domain;
2040 /* This domain is shared by devices under p2p bridge */
2041 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2042
2043 /* pcie-to-pci bridge already has a domain, uses it */
2044 found = NULL;
2045 spin_lock_irqsave(&device_domain_lock, flags);
2046 list_for_each_entry(tmp, &device_domain_list, global) {
2047 if (tmp->segment == segment &&
2048 tmp->bus == bus && tmp->devfn == devfn) {
2049 found = tmp->domain;
2050 break;
2051 }
2052 }
2053 if (found) {
2054 spin_unlock_irqrestore(&device_domain_lock, flags);
2055 free_devinfo_mem(info);
2056 domain_exit(domain);
2057 domain = found;
2058 } else {
2059 list_add(&info->link, &domain->devices);
2060 list_add(&info->global, &device_domain_list);
2061 spin_unlock_irqrestore(&device_domain_lock, flags);
2062 }
2063 }
2064
2065found_domain:
2066 info = alloc_devinfo_mem();
2067 if (!info)
2068 goto error;
2069 info->segment = segment;
2070 info->bus = pdev->bus->number;
2071 info->devfn = pdev->devfn;
2072 info->dev = pdev;
2073 info->domain = domain;
2074 spin_lock_irqsave(&device_domain_lock, flags);
2075 /* somebody is fast */
2076 found = find_domain(pdev);
2077 if (found != NULL) {
2078 spin_unlock_irqrestore(&device_domain_lock, flags);
2079 if (found != domain) {
2080 domain_exit(domain);
2081 domain = found;
2082 }
2083 free_devinfo_mem(info);
2084 return domain;
2085 }
2086 list_add(&info->link, &domain->devices);
2087 list_add(&info->global, &device_domain_list);
2088 pdev->dev.archdata.iommu = info;
2089 spin_unlock_irqrestore(&device_domain_lock, flags);
2090 return domain;
2091error:
2092 /* recheck it here, maybe others set it */
2093 return find_domain(pdev);
2094}
2095
/* Identity-mapping policy: zero = disabled, else a mask of IDENTMAP_* bits */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4
2100
/*
 * Reserve the IOVA range [start, end] in @domain and install a 1:1
 * (virtual == physical) mapping for it.  Returns 0 on success, -ENOMEM
 * when the iova reservation or the mapping fails.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2126
2127static int iommu_prepare_identity_map(struct pci_dev *pdev,
2128 unsigned long long start,
2129 unsigned long long end)
2130{
2131 struct dmar_domain *domain;
2132 int ret;
2133
2134 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2135 if (!domain)
2136 return -ENOMEM;
2137
2138 /* For _hardware_ passthrough, don't bother. But for software
2139 passthrough, we do it anyway -- it may indicate a memory
2140 range which is reserved in E820, so which didn't get set
2141 up to start with in si_domain */
2142 if (domain == si_domain && hw_pass_through) {
2143 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2144 pci_name(pdev), start, end);
2145 return 0;
2146 }
2147
2148 printk(KERN_INFO
2149 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2150 pci_name(pdev), start, end);
2151
2152 if (end < start) {
2153 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2154 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2155 dmi_get_system_info(DMI_BIOS_VENDOR),
2156 dmi_get_system_info(DMI_BIOS_VERSION),
2157 dmi_get_system_info(DMI_PRODUCT_VERSION));
2158 ret = -EIO;
2159 goto error;
2160 }
2161
2162 if (end >> agaw_to_width(domain->agaw)) {
2163 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2164 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165 agaw_to_width(domain->agaw),
2166 dmi_get_system_info(DMI_BIOS_VENDOR),
2167 dmi_get_system_info(DMI_BIOS_VERSION),
2168 dmi_get_system_info(DMI_PRODUCT_VERSION));
2169 ret = -EIO;
2170 goto error;
2171 }
2172
2173 ret = iommu_domain_identity_map(domain, start, end);
2174 if (ret)
2175 goto error;
2176
2177 /* context entry init */
2178 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2179 if (ret)
2180 goto error;
2181
2182 return 0;
2183
2184 error:
2185 domain_exit(domain);
2186 return ret;
2187}
2188
2189static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2190 struct pci_dev *pdev)
2191{
2192 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2193 return 0;
2194 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2195 rmrr->end_address);
2196}
2197
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Identity-map the first 16MiB for the ISA (LPC) bridge so that legacy
 * floppy DMA keeps working behind the IOMMU.  Failure is logged but not
 * fatal.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
/* workaround not configured: nothing to map */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2222
2223static int md_domain_init(struct dmar_domain *domain, int guest_width);
2224
/*
 * Create and initialize the static identity (si) domain and attach it
 * to every active IOMMU.  For software passthrough (@hw == 0) every
 * online node's memory ranges are identity-mapped into it; for hardware
 * passthrough no mappings are installed.  Returns 0 on success,
 * negative errno on failure.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	/* hardware passthrough never consults the page tables */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2269
2270static void domain_remove_one_dev_info(struct dmar_domain *domain,
2271 struct pci_dev *pdev);
2272static int identity_mapping(struct pci_dev *pdev)
2273{
2274 struct device_domain_info *info;
2275
2276 if (likely(!iommu_identity_mapping))
2277 return 0;
2278
2279 info = pdev->dev.archdata.iommu;
2280 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2281 return (info->domain == si_domain);
2282
2283 return 0;
2284}
2285
/*
 * Bind @pdev to @domain: allocate a device_domain_info descriptor, link
 * it on both the domain's device list and the global device list, and
 * program the context entry with the requested @translation type
 * (e.g. CONTEXT_TT_MULTI_LEVEL or CONTEXT_TT_PASS_THROUGH).
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error
 * from domain_context_mapping() — in which case every list/pointer
 * change made here is rolled back under the same lock.
 */
static int domain_add_dev_info(struct dmar_domain *domain,
			       struct pci_dev *pdev,
			       int translation)
{
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	/* Publish the binding before touching hardware context entries. */
	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	ret = domain_context_mapping(domain, pdev, translation);
	if (ret) {
		/* Undo the publication above; nobody may still see info. */
		spin_lock_irqsave(&device_domain_lock, flags);
		list_del(&info->link);
		list_del(&info->global);
		pdev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return ret;
	}

	return 0;
}
2323
2324static bool device_has_rmrr(struct pci_dev *dev)
2325{
2326 struct dmar_rmrr_unit *rmrr;
2327 int i;
2328
2329 for_each_rmrr_units(rmrr) {
2330 for (i = 0; i < rmrr->devices_cnt; i++) {
2331 /*
2332 * Return TRUE if this RMRR contains the device that
2333 * is passed in.
2334 */
2335 if (rmrr->devices[i] == dev)
2336 return true;
2337 }
2338 }
2339 return false;
2340}
2341
2342static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2343{
2344
2345 /*
2346 * We want to prevent any device associated with an RMRR from
2347 * getting placed into the SI Domain. This is done because
2348 * problems exist when devices are moved in and out of domains
2349 * and their respective RMRR info is lost. We exempt USB devices
2350 * from this process due to their usage of RMRRs that are known
2351 * to not be needed after BIOS hand-off to OS.
2352 */
2353 if (device_has_rmrr(pdev) &&
2354 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2355 return 0;
2356
2357 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2358 return 1;
2359
2360 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2361 return 1;
2362
2363 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2364 return 0;
2365
2366 /*
2367 * We want to start off with all devices in the 1:1 domain, and
2368 * take them out later if we find they can't access all of memory.
2369 *
2370 * However, we can't do this for PCI devices behind bridges,
2371 * because all PCI devices behind the same bridge will end up
2372 * with the same source-id on their transactions.
2373 *
2374 * Practically speaking, we can't change things around for these
2375 * devices at run-time, because we can't be sure there'll be no
2376 * DMA transactions in flight for any of their siblings.
2377 *
2378 * So PCI devices (unless they're on the root bus) as well as
2379 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2380 * the 1:1 domain, just in _case_ one of their siblings turns out
2381 * not to be able to map all of memory.
2382 */
2383 if (!pci_is_pcie(pdev)) {
2384 if (!pci_is_root_bus(pdev->bus))
2385 return 0;
2386 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2387 return 0;
2388 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2389 return 0;
2390
2391 /*
2392 * At boot time, we don't yet know if devices will be 64-bit capable.
2393 * Assume that they will -- if they turn out not to be, then we can
2394 * take them out of the 1:1 domain later.
2395 */
2396 if (!startup) {
2397 /*
2398 * If the device's dma_mask is less than the system's memory
2399 * size then this is not a candidate for identity mapping.
2400 */
2401 u64 dma_mask = pdev->dma_mask;
2402
2403 if (pdev->dev.coherent_dma_mask &&
2404 pdev->dev.coherent_dma_mask < dma_mask)
2405 dma_mask = pdev->dev.coherent_dma_mask;
2406
2407 return dma_mask >= dma_get_required_mask(&pdev->dev);
2408 }
2409
2410 return 1;
2411}
2412
2413static int __init iommu_prepare_static_identity_mapping(int hw)
2414{
2415 struct pci_dev *pdev = NULL;
2416 int ret;
2417
2418 ret = si_domain_init(hw);
2419 if (ret)
2420 return -EFAULT;
2421
2422 for_each_pci_dev(pdev) {
2423 if (iommu_should_identity_map(pdev, 1)) {
2424 ret = domain_add_dev_info(si_domain, pdev,
2425 hw ? CONTEXT_TT_PASS_THROUGH :
2426 CONTEXT_TT_MULTI_LEVEL);
2427 if (ret) {
2428 /* device not associated with an iommu */
2429 if (ret == -ENODEV)
2430 continue;
2431 return ret;
2432 }
2433 pr_info("IOMMU: %s identity mapping for device %s\n",
2434 hw ? "hardware" : "software", pci_name(pdev));
2435 }
2436 }
2437
2438 return 0;
2439}
2440
2441static int __init init_dmars(void)
2442{
2443 struct dmar_drhd_unit *drhd;
2444 struct dmar_rmrr_unit *rmrr;
2445 struct pci_dev *pdev;
2446 struct intel_iommu *iommu;
2447 int i, ret;
2448
2449 /*
2450 * for each drhd
2451 * allocate root
2452 * initialize and program root entry to not present
2453 * endfor
2454 */
2455 for_each_drhd_unit(drhd) {
2456 /*
2457 * lock not needed as this is only incremented in the single
2458 * threaded kernel __init code path all other access are read
2459 * only
2460 */
2461 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2462 g_num_of_iommus++;
2463 continue;
2464 }
2465 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2466 IOMMU_UNITS_SUPPORTED);
2467 }
2468
2469 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2470 GFP_KERNEL);
2471 if (!g_iommus) {
2472 printk(KERN_ERR "Allocating global iommu array failed\n");
2473 ret = -ENOMEM;
2474 goto error;
2475 }
2476
2477 deferred_flush = kzalloc(g_num_of_iommus *
2478 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2479 if (!deferred_flush) {
2480 ret = -ENOMEM;
2481 goto error;
2482 }
2483
2484 for_each_drhd_unit(drhd) {
2485 if (drhd->ignored)
2486 continue;
2487
2488 iommu = drhd->iommu;
2489 g_iommus[iommu->seq_id] = iommu;
2490
2491 ret = iommu_init_domains(iommu);
2492 if (ret)
2493 goto error;
2494
2495 /*
2496 * TBD:
2497 * we could share the same root & context tables
2498 * among all IOMMU's. Need to Split it later.
2499 */
2500 ret = iommu_alloc_root_entry(iommu);
2501 if (ret) {
2502 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2503 goto error;
2504 }
2505 if (!ecap_pass_through(iommu->ecap))
2506 hw_pass_through = 0;
2507 }
2508
2509 /*
2510 * Start from the sane iommu hardware state.
2511 */
2512 for_each_drhd_unit(drhd) {
2513 if (drhd->ignored)
2514 continue;
2515
2516 iommu = drhd->iommu;
2517
2518 /*
2519 * If the queued invalidation is already initialized by us
2520 * (for example, while enabling interrupt-remapping) then
2521 * we got the things already rolling from a sane state.
2522 */
2523 if (iommu->qi)
2524 continue;
2525
2526 /*
2527 * Clear any previous faults.
2528 */
2529 dmar_fault(-1, iommu);
2530 /*
2531 * Disable queued invalidation if supported and already enabled
2532 * before OS handover.
2533 */
2534 dmar_disable_qi(iommu);
2535 }
2536
2537 for_each_drhd_unit(drhd) {
2538 if (drhd->ignored)
2539 continue;
2540
2541 iommu = drhd->iommu;
2542
2543 if (dmar_enable_qi(iommu)) {
2544 /*
2545 * Queued Invalidate not enabled, use Register Based
2546 * Invalidate
2547 */
2548 iommu->flush.flush_context = __iommu_flush_context;
2549 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2550 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2551 "invalidation\n",
2552 iommu->seq_id,
2553 (unsigned long long)drhd->reg_base_addr);
2554 } else {
2555 iommu->flush.flush_context = qi_flush_context;
2556 iommu->flush.flush_iotlb = qi_flush_iotlb;
2557 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2558 "invalidation\n",
2559 iommu->seq_id,
2560 (unsigned long long)drhd->reg_base_addr);
2561 }
2562 }
2563
2564 if (iommu_pass_through)
2565 iommu_identity_mapping |= IDENTMAP_ALL;
2566
2567#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2568 iommu_identity_mapping |= IDENTMAP_GFX;
2569#endif
2570
2571 check_tylersburg_isoch();
2572
2573 /*
2574 * If pass through is not set or not enabled, setup context entries for
2575 * identity mappings for rmrr, gfx, and isa and may fall back to static
2576 * identity mapping if iommu_identity_mapping is set.
2577 */
2578 if (iommu_identity_mapping) {
2579 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2580 if (ret) {
2581 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2582 goto error;
2583 }
2584 }
2585 /*
2586 * For each rmrr
2587 * for each dev attached to rmrr
2588 * do
2589 * locate drhd for dev, alloc domain for dev
2590 * allocate free domain
2591 * allocate page table entries for rmrr
2592 * if context not allocated for bus
2593 * allocate and init context
2594 * set present in root table for this bus
2595 * init context with domain, translation etc
2596 * endfor
2597 * endfor
2598 */
2599 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2600 for_each_rmrr_units(rmrr) {
2601 for (i = 0; i < rmrr->devices_cnt; i++) {
2602 pdev = rmrr->devices[i];
2603 /*
2604 * some BIOS lists non-exist devices in DMAR
2605 * table.
2606 */
2607 if (!pdev)
2608 continue;
2609 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2610 if (ret)
2611 printk(KERN_ERR
2612 "IOMMU: mapping reserved region failed\n");
2613 }
2614 }
2615
2616 iommu_prepare_isa();
2617
2618 /*
2619 * for each drhd
2620 * enable fault log
2621 * global invalidate context cache
2622 * global invalidate iotlb
2623 * enable translation
2624 */
2625 for_each_drhd_unit(drhd) {
2626 if (drhd->ignored) {
2627 /*
2628 * we always have to disable PMRs or DMA may fail on
2629 * this device
2630 */
2631 if (force_on)
2632 iommu_disable_protect_mem_regions(drhd->iommu);
2633 continue;
2634 }
2635 iommu = drhd->iommu;
2636
2637 iommu_flush_write_buffer(iommu);
2638
2639 ret = dmar_set_interrupt(iommu);
2640 if (ret)
2641 goto error;
2642
2643 iommu_set_root_entry(iommu);
2644
2645 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2646 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2647
2648 ret = iommu_enable_translation(iommu);
2649 if (ret)
2650 goto error;
2651
2652 iommu_disable_protect_mem_regions(iommu);
2653 }
2654
2655 return 0;
2656error:
2657 for_each_drhd_unit(drhd) {
2658 if (drhd->ignored)
2659 continue;
2660 iommu = drhd->iommu;
2661 free_iommu(iommu);
2662 }
2663 kfree(g_iommus);
2664 return ret;
2665}
2666
2667/* This takes a number of _MM_ pages, not VTD pages */
2668static struct iova *intel_alloc_iova(struct device *dev,
2669 struct dmar_domain *domain,
2670 unsigned long nrpages, uint64_t dma_mask)
2671{
2672 struct pci_dev *pdev = to_pci_dev(dev);
2673 struct iova *iova = NULL;
2674
2675 /* Restrict dma_mask to the width that the iommu can handle */
2676 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2677
2678 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2679 /*
2680 * First try to allocate an io virtual address in
2681 * DMA_BIT_MASK(32) and if that fails then try allocating
2682 * from higher range
2683 */
2684 iova = alloc_iova(&domain->iovad, nrpages,
2685 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2686 if (iova)
2687 return iova;
2688 }
2689 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2690 if (unlikely(!iova)) {
2691 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2692 nrpages, pci_name(pdev));
2693 return NULL;
2694 }
2695
2696 return iova;
2697}
2698
2699static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2700{
2701 struct dmar_domain *domain;
2702 int ret;
2703
2704 domain = get_domain_for_dev(pdev,
2705 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2706 if (!domain) {
2707 printk(KERN_ERR
2708 "Allocating domain for %s failed", pci_name(pdev));
2709 return NULL;
2710 }
2711
2712 /* make sure context mapping is ok */
2713 if (unlikely(!domain_context_mapped(pdev))) {
2714 ret = domain_context_mapping(domain, pdev,
2715 CONTEXT_TT_MULTI_LEVEL);
2716 if (ret) {
2717 printk(KERN_ERR
2718 "Domain context map for %s failed",
2719 pci_name(pdev));
2720 return NULL;
2721 }
2722 }
2723
2724 return domain;
2725}
2726
2727static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2728{
2729 struct device_domain_info *info;
2730
2731 /* No lock here, assumes no domain exit in normal case */
2732 info = dev->dev.archdata.iommu;
2733 if (likely(info))
2734 return info->domain;
2735
2736 return __get_valid_domain_for_dev(dev);
2737}
2738
2739static int iommu_dummy(struct pci_dev *pdev)
2740{
2741 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2742}
2743
/* Check if the pdev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	/* Non-PCI devices are never translated by this driver. */
	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	if (iommu_dummy(pdev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		/* Re-evaluate: the device's DMA mask may have changed. */
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		else {
			/*
			 * 32 bit DMA is removed from si_domain and fall back
			 * to non-identity mapping.
			 */
			domain_remove_one_dev_info(si_domain, pdev);
			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
			       pci_name(pdev));
			return 0;
		}
	} else {
		/*
		 * In case of a detached 64 bit DMA device from vm, the device
		 * is put into si_domain for identity mapping.
		 */
		if (iommu_should_identity_map(pdev, 0)) {
			int ret;
			ret = domain_add_dev_info(si_domain, pdev,
						  hw_pass_through ?
						  CONTEXT_TT_PASS_THROUGH :
						  CONTEXT_TT_MULTI_LEVEL);
			if (!ret) {
				printk(KERN_INFO "64bit %s uses identity mapping\n",
				       pci_name(pdev));
				return 1;
			}
		}
	}

	return 0;
}
2795
/*
 * Core single-buffer mapping routine: map @size bytes at physical
 * address @paddr for @hwdev and return the DMA (bus) address, or 0 on
 * failure.  Identity-mapped devices return @paddr directly.  The
 * mapping is rounded out to whole VT-d pages; the sub-page offset of
 * @paddr is folded back into the returned address.
 */
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* Note: size becomes a page count from here on. */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
2861
2862static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2863 unsigned long offset, size_t size,
2864 enum dma_data_direction dir,
2865 struct dma_attrs *attrs)
2866{
2867 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2868 dir, to_pci_dev(dev)->dma_mask);
2869}
2870
/*
 * Drain every deferred-unmap table: flush the IOTLB for each pending
 * entry (or once globally per iommu when not in caching mode) and
 * release the queued IOVAs.  Caller must hold async_umap_flush_lock.
 */
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
2911
2912static void flush_unmaps_timeout(unsigned long data)
2913{
2914 unsigned long flags;
2915
2916 spin_lock_irqsave(&async_umap_flush_lock, flags);
2917 flush_unmaps();
2918 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2919}
2920
/*
 * Queue @iova (belonging to @dom) for deferred IOTLB flush and release.
 * When the global queue hits HIGH_WATER_MARK it is drained synchronously
 * first; otherwise a 10ms timer guarantees the flush happens soon.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	/* Drain first so the per-iommu table below has a free slot. */
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2946
/*
 * dma_map_ops .unmap_page hook: tear down the mapping at @dev_addr.
 * Clears the PTEs, frees now-empty page-table pages, and either flushes
 * the IOTLB immediately (strict mode) or defers flush + IOVA release
 * via add_unmap().  Identity-mapped devices have nothing to undo.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
2995
/*
 * dma_map_ops .alloc hook: allocate @size bytes of zeroed, coherent
 * memory and map it bidirectionally, returning the kernel address and
 * the bus address in *@dma_handle.  For translated devices the GFP DMA
 * zone restrictions are dropped (the IOMMU handles addressing); for
 * bypassed devices the zone is chosen from the coherent DMA mask.
 */
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  struct dma_attrs *attrs)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	/* Mapping failed: give the pages back. */
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
3028
3029static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3030 dma_addr_t dma_handle, struct dma_attrs *attrs)
3031{
3032 int order;
3033
3034 size = PAGE_ALIGN(size);
3035 order = get_order(size);
3036
3037 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3038 free_pages((unsigned long)vaddr, order);
3039}
3040
/*
 * dma_map_ops .unmap_sg hook: tear down a scatterlist mapping.  The
 * whole list occupies one contiguous IOVA range (see intel_map_sg), so
 * only sglist[0].dma_address is needed to find it.  Mirrors
 * intel_unmap_page: clear PTEs, free page-table pages, then flush
 * immediately (strict) or defer via add_unmap().
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3086
3087static int intel_nontranslate_map_sg(struct device *hddev,
3088 struct scatterlist *sglist, int nelems, int dir)
3089{
3090 int i;
3091 struct scatterlist *sg;
3092
3093 for_each_sg(sglist, sg, nelems, i) {
3094 BUG_ON(!sg_page(sg));
3095 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3096 sg->dma_length = sg->length;
3097 }
3098 return nelems;
3099}
3100
/*
 * dma_map_ops .map_sg hook: map a scatterlist into one contiguous IOVA
 * range.  Returns the number of mapped elements, or 0 on failure (with
 * sglist->dma_length zeroed when the IOVA allocation fails).
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	/* Total page count covering every segment, offsets included. */
	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
3168
3169static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3170{
3171 return !dma_addr;
3172}
3173
3174struct dma_map_ops intel_dma_ops = {
3175 .alloc = intel_alloc_coherent,
3176 .free = intel_free_coherent,
3177 .map_sg = intel_map_sg,
3178 .unmap_sg = intel_unmap_sg,
3179 .map_page = intel_map_page,
3180 .unmap_page = intel_unmap_page,
3181 .mapping_error = intel_mapping_error,
3182};
3183
3184static inline int iommu_domain_cache_init(void)
3185{
3186 int ret = 0;
3187
3188 iommu_domain_cache = kmem_cache_create("iommu_domain",
3189 sizeof(struct dmar_domain),
3190 0,
3191 SLAB_HWCACHE_ALIGN,
3192
3193 NULL);
3194 if (!iommu_domain_cache) {
3195 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3196 ret = -ENOMEM;
3197 }
3198
3199 return ret;
3200}
3201
3202static inline int iommu_devinfo_cache_init(void)
3203{
3204 int ret = 0;
3205
3206 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3207 sizeof(struct device_domain_info),
3208 0,
3209 SLAB_HWCACHE_ALIGN,
3210 NULL);
3211 if (!iommu_devinfo_cache) {
3212 printk(KERN_ERR "Couldn't create devinfo cache\n");
3213 ret = -ENOMEM;
3214 }
3215
3216 return ret;
3217}
3218
3219static inline int iommu_iova_cache_init(void)
3220{
3221 int ret = 0;
3222
3223 iommu_iova_cache = kmem_cache_create("iommu_iova",
3224 sizeof(struct iova),
3225 0,
3226 SLAB_HWCACHE_ALIGN,
3227 NULL);
3228 if (!iommu_iova_cache) {
3229 printk(KERN_ERR "Couldn't create iova cache\n");
3230 ret = -ENOMEM;
3231 }
3232
3233 return ret;
3234}
3235
3236static int __init iommu_init_mempool(void)
3237{
3238 int ret;
3239 ret = iommu_iova_cache_init();
3240 if (ret)
3241 return ret;
3242
3243 ret = iommu_domain_cache_init();
3244 if (ret)
3245 goto domain_error;
3246
3247 ret = iommu_devinfo_cache_init();
3248 if (!ret)
3249 return ret;
3250
3251 kmem_cache_destroy(iommu_domain_cache);
3252domain_error:
3253 kmem_cache_destroy(iommu_iova_cache);
3254
3255 return -ENOMEM;
3256}
3257
3258static void __init iommu_exit_mempool(void)
3259{
3260 kmem_cache_destroy(iommu_devinfo_cache);
3261 kmem_cache_destroy(iommu_domain_cache);
3262 kmem_cache_destroy(iommu_iova_cache);
3263
3264}
3265
/*
 * Quirk for Sandy Bridge IOAT (QuickData) devices: verify the BIOS
 * assigned them to their own local VT-d unit.  If not, taint with a
 * firmware-workaround flag and mark the device to bypass translation.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3293
/*
 * Mark DRHD units that should not be used for remapping:
 *  - units whose device scope names only non-existent devices, and
 *  - when dmar_map_gfx is clear, units that control nothing but
 *    graphics devices (the devices behind them are then flagged with
 *    DUMMY_DEVICE_DOMAIN_INFO so the DMA API bypasses translation).
 * Otherwise a gfx-only unit just sets intel_iommu_gfx_mapped.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		/* Any non-graphics device under this unit? */
		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for (i = 0; i < drhd->devices_cnt; i++) {
				if (!drhd->devices[i])
					continue;
				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
			}
		}
	}
}
3337
#ifdef CONFIG_SUSPEND
/*
 * Re-initialize remapping hardware after resume: re-enable queued
 * invalidation where it was in use, reprogram root entries, globally
 * invalidate caches, and turn translation back on.  Returns 0 on
 * success, 1 if enabling translation fails on any unit.
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		if (iommu_enable_translation(iommu))
			return 1;
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
3374
/* Globally invalidate the context cache and IOTLB on every active unit. */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
3387
/*
 * Syscore suspend hook: flush all caches, disable translation, and save
 * each unit's fault-event registers so iommu_resume() can restore them.
 * Returns 0 on success, -ENOMEM if any state buffer cannot be allocated
 * (already-allocated buffers are freed again; kfree(NULL) is harmless).
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
						 GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}
3427
/*
 * Syscore resume hook: re-initialize the IOMMU hardware and restore the
 * fault-event registers saved by iommu_suspend(), then release the save
 * buffers.
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		/* Under tboot a dead IOMMU is fatal; otherwise just warn. */
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}
3461
3462static struct syscore_ops iommu_syscore_ops = {
3463 .resume = iommu_resume,
3464 .suspend = iommu_suspend,
3465};
3466
/* Register suspend/resume hooks so IOMMU state survives system sleep. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
3471
3472#else
3473static inline void init_iommu_pm_ops(void) {}
#endif /* CONFIG_SUSPEND */
3475
3476LIST_HEAD(dmar_rmrr_units);
3477
/* Add a parsed RMRR to the global list consulted during device setup. */
static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
{
	list_add(&rmrr->list, &dmar_rmrr_units);
}
3482
3483
3484int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3485{
3486 struct acpi_dmar_reserved_memory *rmrr;
3487 struct dmar_rmrr_unit *rmrru;
3488
3489 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3490 if (!rmrru)
3491 return -ENOMEM;
3492
3493 rmrru->hdr = header;
3494 rmrr = (struct acpi_dmar_reserved_memory *)header;
3495 rmrru->base_address = rmrr->base_address;
3496 rmrru->end_address = rmrr->end_address;
3497
3498 dmar_register_rmrr_unit(rmrru);
3499 return 0;
3500}
3501
3502static int __init
3503rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3504{
3505 struct acpi_dmar_reserved_memory *rmrr;
3506 int ret;
3507
3508 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3509 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3510 ((void *)rmrr) + rmrr->header.length,
3511 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3512
3513 if (ret || (rmrru->devices_cnt == 0)) {
3514 list_del(&rmrru->list);
3515 kfree(rmrru);
3516 }
3517 return ret;
3518}
3519
3520static LIST_HEAD(dmar_atsr_units);
3521
3522int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3523{
3524 struct acpi_dmar_atsr *atsr;
3525 struct dmar_atsr_unit *atsru;
3526
3527 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3528 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3529 if (!atsru)
3530 return -ENOMEM;
3531
3532 atsru->hdr = hdr;
3533 atsru->include_all = atsr->flags & 0x1;
3534
3535 list_add(&atsru->list, &dmar_atsr_units);
3536
3537 return 0;
3538}
3539
3540static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3541{
3542 int rc;
3543 struct acpi_dmar_atsr *atsr;
3544
3545 if (atsru->include_all)
3546 return 0;
3547
3548 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3549 rc = dmar_parse_dev_scope((void *)(atsr + 1),
3550 (void *)atsr + atsr->header.length,
3551 &atsru->devices_cnt, &atsru->devices,
3552 atsr->segment);
3553 if (rc || !atsru->devices_cnt) {
3554 list_del(&atsru->list);
3555 kfree(atsru);
3556 }
3557
3558 return rc;
3559}
3560
/*
 * Decide whether @dev may use Address Translation Services: find the
 * ATSR unit covering the device's PCI segment, then check that the root
 * port above the device is in that unit's scope (or that the unit is
 * INCLUDE_ALL).  Returns 1 if ATS is allowed, 0 otherwise.
 */
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i;
	struct pci_bus *bus;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	/* Virtual functions inherit the ATS properties of their PF. */
	dev = pci_physfn(dev);

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment == pci_domain_nr(dev->bus))
			goto found;
	}

	return 0;

found:
	/* Walk upstream until we hit the root port the device hangs off. */
	for (bus = dev->bus; bus; bus = bus->parent) {
		struct pci_dev *bridge = bus->self;

		/* Conventional PCI above the device rules out ATS. */
		if (!bridge || !pci_is_pcie(bridge) ||
		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;

		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
			for (i = 0; i < atsru->devices_cnt; i++)
				if (atsru->devices[i] == bridge)
					return 1;
			break;
		}
	}

	if (atsru->include_all)
		return 1;

	return 0;
}
3599
3600int __init dmar_parse_rmrr_atsr_dev(void)
3601{
3602 struct dmar_rmrr_unit *rmrr, *rmrr_n;
3603 struct dmar_atsr_unit *atsr, *atsr_n;
3604 int ret = 0;
3605
3606 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3607 ret = rmrr_parse_dev(rmrr);
3608 if (ret)
3609 return ret;
3610 }
3611
3612 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3613 ret = atsr_parse_dev(atsr);
3614 if (ret)
3615 return ret;
3616 }
3617
3618 return ret;
3619}
3620
3621/*
3622 * Here we only respond to action of unbound device from driver.
3623 *
3624 * Added device is not attached to its DMAR domain here yet. That will happen
3625 * when mapping the device to iova.
3626 */
3627static int device_notifier(struct notifier_block *nb,
3628 unsigned long action, void *data)
3629{
3630 struct device *dev = data;
3631 struct pci_dev *pdev = to_pci_dev(dev);
3632 struct dmar_domain *domain;
3633
3634 if (iommu_no_mapping(dev))
3635 return 0;
3636
3637 domain = find_domain(pdev);
3638 if (!domain)
3639 return 0;
3640
3641 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3642 domain_remove_one_dev_info(domain, pdev);
3643
3644 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3645 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3646 list_empty(&domain->devices))
3647 domain_exit(domain);
3648 }
3649
3650 return 0;
3651}
3652
/* Registered on the PCI bus by intel_iommu_init(). */
static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
3656
/*
 * Main VT-d entry point: parse the DMAR ACPI table, quiesce any
 * firmware-enabled translation, initialize the remapping hardware and
 * install the Intel DMA ops and IOMMU API ops.  Returns 0 on success
 * or a negative errno.
 */
int __init intel_iommu_init(void)
{
	int ret = 0;
	struct dmar_drhd_unit *drhd;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		return -ENODEV;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	for_each_drhd_unit(drhd) {
		struct intel_iommu *iommu;

		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		if (iommu->gcmd & DMA_GCMD_TE)
			iommu_disable_translation(iommu);
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		return -ENODEV;
	}

	/* Translation is disabled above either way; safe to bail out here. */
	if (no_iommu || dmar_disabled)
		return -ENODEV;

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENODEV;
	}

	if (list_empty(&dmar_rmrr_units))
		printk(KERN_INFO "DMAR: No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		printk(KERN_INFO "DMAR: No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		return -ENODEV;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		/* Unwind dmar_init_reserved_ranges() and the mempools. */
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
	/* With the IOMMU active, software bounce buffering is not needed. */
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);

	bus_register_notifier(&pci_bus_type, &device_nb);

	intel_iommu_enabled = 1;

	return 0;
}
3742
/*
 * Clear the context entries of every bridge between @pdev and its
 * upstream PCIe-to-PCI bridge.  Devices behind such a bridge can issue
 * DMA with the bridge's source-id, so the whole chain is detached
 * together with the device.
 */
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					  struct pci_dev *pdev)
{
	struct pci_dev *tmp, *parent;

	if (!iommu || !pdev)
		return;

	/* dependent device detach */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	if (tmp) {
		/* Detach every intermediate bridge below the PCIe bridge. */
		parent = pdev->bus->self;
		while (parent != tmp) {
			iommu_detach_dev(iommu, parent->bus->number,
					 parent->devfn);
			parent = parent->bus->self;
		}
		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
			iommu_detach_dev(iommu,
				tmp->subordinate->number, 0);
		else /* this is a legacy PCI bridge */
			iommu_detach_dev(iommu, tmp->bus->number,
					 tmp->devfn);
	}
}
3769
/*
 * Detach @pdev from @domain: unlink its device_domain_info, clear its
 * (and its dependent bridges') context entries, and - when no other
 * device on the same IOMMU remains in the domain - drop the domain's
 * reference on that IOMMU and release the domain id there.
 */
static void domain_remove_one_dev_info(struct dmar_domain *domain,
					  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		if (info->segment == pci_domain_nr(pdev->bus) &&
		    info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			/* Drop the lock across the hardware detach calls. */
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_disable_dev_iotlb(info);
			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			/*
			 * If a sibling on the same IOMMU was already seen we
			 * can stop; otherwise keep scanning for one.
			 */
			if (found)
				break;
			else
				continue;
		}

		/* if there is no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * update iommu count and coherency
		 */
		if (iommu == device_to_iommu(info->segment, info->bus,
					    info->devfn))
			found = 1;
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);

		/* VM/SI domains manage their ids elsewhere; skip them here. */
		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
			spin_lock_irqsave(&iommu->lock, tmp_flags);
			clear_bit(domain->id, iommu->domain_ids);
			iommu->domains[domain->id] = NULL;
			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
		}
	}
}
3837
/*
 * Detach every device from a VM domain: pop each device_domain_info,
 * clear the hardware context entries, and update the domain's per-IOMMU
 * bookkeeping.  device_domain_lock is dropped across the hardware
 * detach calls and re-taken for each list operation.
 */
static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		iommu_detach_dependent_devices(iommu, info->dev);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}
3876
3877/* domain id for virtual machine, it won't be set in context */
3878static unsigned long vm_domid;
3879
3880static struct dmar_domain *iommu_alloc_vm_domain(void)
3881{
3882 struct dmar_domain *domain;
3883
3884 domain = alloc_domain_mem();
3885 if (!domain)
3886 return NULL;
3887
3888 domain->id = vm_domid++;
3889 domain->nid = -1;
3890 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3891 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3892
3893 return domain;
3894}
3895
/*
 * Initialize a freshly allocated (VM) domain: set up its iova space,
 * compute the address-width/AGAW from @guest_width, reset capability
 * fields and allocate the top-level page directory.
 * Returns 0 or -ENOMEM.
 */
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	/* No IOMMU attached yet; capability fields start at zero. */
	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;
	domain->nid = -1;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
3926
/*
 * Release the per-IOMMU domain ids a VM domain occupies: scan every
 * non-ignored DRHD unit and clear the id slot that points at @domain.
 * NOTE(review): the iommu->domains[i] == domain test runs before
 * iommu->lock is taken; the lock only covers the actual clear.
 */
static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(i, iommu->domain_ids, ndomains) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
		}
	}
}
3952
/*
 * Tear down a VM domain completely.  Order matters: devices are
 * detached first so nothing can DMA through the mappings, then the
 * iova allocator, page-table entries and page tables are freed, and
 * finally the per-IOMMU ids and the domain itself are released.
 */
static void vm_domain_exit(struct dmar_domain *domain)
{
	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}
3972
3973static int intel_iommu_domain_init(struct iommu_domain *domain)
3974{
3975 struct dmar_domain *dmar_domain;
3976
3977 dmar_domain = iommu_alloc_vm_domain();
3978 if (!dmar_domain) {
3979 printk(KERN_ERR
3980 "intel_iommu_domain_init: dmar_domain == NULL\n");
3981 return -ENOMEM;
3982 }
3983 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3984 printk(KERN_ERR
3985 "intel_iommu_domain_init() failed\n");
3986 vm_domain_exit(dmar_domain);
3987 return -ENOMEM;
3988 }
3989 domain_update_iommu_cap(dmar_domain);
3990 domain->priv = dmar_domain;
3991
3992 return 0;
3993}
3994
/* iommu-api hook: tear down the VM domain backing @domain. */
static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}
4002
/*
 * iommu-api hook: attach @dev to @domain.  Evicts the device from any
 * previous domain, verifies the IOMMU's address width can cover the
 * domain's mapped range, and trims excess page-table levels before
 * programming the context entry.  Returns 0 or a negative errno.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		/* Promote the first entry of the top level to be the new top. */
		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}
4060
/* iommu-api hook: detach @dev from the VM domain backing @domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}
4069
/*
 * iommu-api hook: map [@hpa, @hpa + @size) at @iova with @iommu_prot
 * permissions.  Grows the domain's max_addr bookkeeping after checking
 * the guest address width can cover it.  Returns 0 or a negative errno.
 */
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	int ret;

	/* Translate generic IOMMU_* flags into VT-d PTE bits. */
	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	/* Snoop-control bit only if the hardware supports it. */
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
4107
/*
 * iommu-api hook: unmap the range starting at @iova.  Returns the
 * number of bytes actually unmapped (PAGE_SIZE << order), which may
 * differ from @size as reported by dma_pte_clear_range().
 */
static size_t intel_iommu_unmap(struct iommu_domain *domain,
			     unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	int order;

	order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	/* Shrink the high-water mark if we just unmapped the top range. */
	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return PAGE_SIZE << order;
}
4122
4123static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4124 unsigned long iova)
4125{
4126 struct dmar_domain *dmar_domain = domain->priv;
4127 struct dma_pte *pte;
4128 u64 phys = 0;
4129
4130 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4131 if (pte)
4132 phys = dma_pte_addr(pte);
4133
4134 return phys;
4135}
4136
4137static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4138 unsigned long cap)
4139{
4140 struct dmar_domain *dmar_domain = domain->priv;
4141
4142 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4143 return dmar_domain->iommu_snooping;
4144 if (cap == IOMMU_CAP_INTR_REMAP)
4145 return intr_remapping_enabled;
4146
4147 return 0;
4148}
4149
4150/*
4151 * Group numbers are arbitrary. Device with the same group number
4152 * indicate the iommu cannot differentiate between them. To avoid
4153 * tracking used groups we just use the seg|bus|devfn of the lowest
4154 * level we're able to differentiate devices
4155 */
/*
 * Compute the isolation group id for @dev.  The union packs the
 * seg/bus/devfn of the lowest differentiable point into one u32; the
 * exact byte layout is endian-dependent, but the value is only used as
 * an opaque id.  Returns 0 with *groupid set, or -ENODEV.
 */
static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *bridge;
	union {
		struct {
			u8 devfn;
			u8 bus;
			u16 segment;
		} pci;
		u32 group;
	} id;

	if (iommu_no_mapping(dev))
		return -ENODEV;

	id.pci.segment = pci_domain_nr(pdev->bus);
	id.pci.bus = pdev->bus->number;
	id.pci.devfn = pdev->devfn;

	if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
		return -ENODEV;

	/*
	 * Devices behind a PCIe-to-PCI bridge share the bridge's source
	 * id, so use the bridge's alias instead of the device's own.
	 */
	bridge = pci_find_upstream_pcie_bridge(pdev);
	if (bridge) {
		if (pci_is_pcie(bridge)) {
			id.pci.bus = bridge->subordinate->number;
			id.pci.devfn = 0;
		} else {
			id.pci.bus = bridge->bus->number;
			id.pci.devfn = bridge->devfn;
		}
	}

	/* Lump multifunction devices together unless requested otherwise. */
	if (!pdev->is_virtfn && iommu_group_mf)
		id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);

	*groupid = id.group;

	return 0;
}
4197
/* iommu-api operations installed via bus_set_iommu() in intel_iommu_init(). */
static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
	.device_group	= intel_iommu_device_group,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};
4210
/*
 * PCI quirk: clear dmar_map_gfx so graphics devices are left out of
 * DMA remapping on chipsets where gfx DMAR is known broken.
 */
static void __devinit quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}
4217
4218DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4219DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4220DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4221DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4222DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4223DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4224DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4225
/*
 * PCI quirk: force write-buffer flushing even though the chipset does
 * not advertise the RWBF capability.
 */
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
4235
4236DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4237DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4238DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4239DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4240DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4241DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4242DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4243
4244#define GGC 0x52
4245#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4246#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4247#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4248#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4249#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4250#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4251#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4252#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4253
/*
 * PCI quirk (Calpella/Ironlake): if the BIOS allocated no VT-enabled
 * graphics stolen memory (per the GGC register), disable graphics
 * remapping entirely; otherwise fall back to strict (unbatched) IOTLB
 * flushing for safety.
 */
static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
4270DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4272DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4274
4275/* On Tylersburg chipsets, some BIOSes have been known to enable the
4276 ISOCH DMAR unit for the Azalia sound device, but not give it any
4277 TLB entries, which causes it to deadlock. Check for that. We do
4278 this in a function called from init_dmars(), instead of in a PCI
4279 quirk, because we don't want to print the obnoxious "BIOS broken"
4280 message if VT-d is actually disabled.
4281*/
/*
 * Tylersburg sanity check (see comment block above): if Azalia audio
 * DMA is routed to the isoch DMAR unit but the BIOS gave that unit no
 * TLB entries, force identity mapping for Azalia; with a merely
 * sub-optimal TLB allocation, just warn.
 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	/* NOTE(review): 0x188 is the isoch-control register in the SMR
	   device's config space per the chipset datasheet — confirm. */
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}
	
	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}