From 1e59964f242bcd8a164e93a4f6022d28159b8a0f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:20 +0200 Subject: [PATCH 01/43] PCI/P2PDMA: Separate the mmap() support from the core logic ANBZ: #31969 commit f58ef9d1d1355b15443719df95081f193067ab88 upstream. Currently the P2PDMA code requires a pgmap and a struct page to function. This was serving three important purposes: - DMA API compatibility, where scatterlist required a struct page as input - Life cycle management, where the percpu_ref is used to prevent UAF during device hot unplug - A way to get the P2P provider data through the pci_p2pdma_pagemap The DMA API now has a new flow, and has gained phys_addr_t support, so it no longer needs struct pages to perform P2P mapping. Lifecycle management can be delegated to the user; DMABUF, for instance, has a suitable invalidation protocol that does not require struct page. Finding the P2P provider data can also be managed by the caller without needing to look it up from the phys_addr. Split the P2PDMA code into two layers. The optional upper layer, effectively, provides a way to mmap() P2P memory into a VMA by providing struct page, pgmap, a genalloc and sysfs. The lower layer provides the actual P2P infrastructure and is wrapped up in a new struct p2pdma_provider. Rework the mmap layer to use the new p2pdma_provider based APIs. Drivers that do not want to put P2P memory into VMAs can allocate a struct p2pdma_provider after probe() starts and free it before remove() completes. When DMA mapping, the driver must convey the struct p2pdma_provider to the DMA mapping code along with a phys_addr of the MMIO BAR slice to map. The driver must ensure that no DMA mapping outlives the lifetime of the struct p2pdma_provider. The intended target of this new API layer is DMABUF. There is usually only a single p2pdma_provider for a DMABUF exporter. Most drivers can establish the p2pdma_provider during probe, access the single instance during DMABUF attach and use that to drive the DMA mapping. DMABUF provides an invalidation mechanism that can guarantee all DMA is halted and the DMA mappings are undone prior to destroying the struct p2pdma_provider. This ensures there is no UAF through DMABUFs that are lingering past driver removal. The new p2pdma_provider layer cannot be used to create P2P memory that can be mapped into VMAs, used with pin_user_pages(), O_DIRECT, and so on. These use cases must still use the mmap() layer. The p2pdma_provider layer is principally for DMABUF-like use cases where DMABUF natively manages the life cycle and access instead of vmas/pin_user_pages()/struct page. In addition, remove the bus_off field from pci_p2pdma_map_state since it duplicates information already available in the pgmap structure. The bus_offset is only used in one location (pci_p2pdma_bus_addr_map) and is always identical to pgmap->bus_offset.
Signed-off-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-1-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/pci/p2pdma.c | 43 ++++++++++++++++++++------------------ include/linux/pci-p2pdma.h | 19 ++++++++++++----- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index c50cf81b1ee0..6b7c34d25961 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -28,9 +28,8 @@ struct pci_p2pdma { }; struct pci_p2pdma_pagemap { - struct pci_dev *provider; - u64 bus_offset; struct dev_pagemap pgmap; + struct p2pdma_provider mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -211,8 +210,8 @@ static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ - struct pci_p2pdma *p2pdma = - rcu_dereference_protected(pgmap->provider->p2pdma, 1); + struct pci_p2pdma *p2pdma = rcu_dereference_protected( + to_pci_dev(pgmap->mem.owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -277,14 +276,15 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) static void pci_p2pdma_unmap_mappings(void *data) { - struct pci_dev *pdev = data; + struct pci_p2pdma_pagemap *p2p_pgmap = data; /* * Removing the alloc attribute from sysfs will call * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj, + &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -335,10 +335,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - - p2p_pgmap->provider = pdev; - p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - - pci_resource_start(pdev, bar); + p2p_pgmap->mem.owner = &pdev->dev; + p2p_pgmap->mem.bus_offset = + pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -347,7 +346,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, } error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, - pdev); + p2p_pgmap); if (error) goto pages_free; @@ -987,16 +986,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, - struct device *dev) +static enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; - struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; + struct pci_dev *pdev = to_pci_dev(provider->owner); struct pci_dev *client; struct pci_p2pdma *p2pdma; int dist; - if (!provider->p2pdma) + if (!pdev->p2pdma) return PCI_P2PDMA_MAP_NOT_SUPPORTED; if (!dev_is_pci(dev)) @@ -1005,7 +1004,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, client = to_pci_dev(dev); rcu_read_lock(); - p2pdma = rcu_dereference(provider->p2pdma); + p2pdma = rcu_dereference(pdev->p2pdma); if (p2pdma) type = 
xa_to_value(xa_load(&p2pdma->map_types, @@ -1013,7 +1012,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, rcu_read_unlock(); if (type == PCI_P2PDMA_MAP_UNKNOWN) - return calc_map_type_and_dist(provider, client, &dist, true); + return calc_map_type_and_dist(pdev, client, &dist, true); return type; } @@ -1021,9 +1020,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { - state->pgmap = page_pgmap(page); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; + struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); + + if (state->mem == &p2p_pgmap->mem) + return; + + state->mem = &p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev); } /** diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 075c20b161d9..27a2c399f47d 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -16,6 +16,16 @@ struct block_device; struct scatterlist; +/** + * struct p2pdma_provider + * + * A p2pdma provider is a range of MMIO address space available to the CPU. + */ +struct p2pdma_provider { + struct device *owner; + u64 bus_offset; +}; + #ifdef CONFIG_PCI_P2PDMA int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); @@ -144,11 +154,11 @@ enum pci_p2pdma_map_type { }; struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; + struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; - u64 bus_off; }; + /* helper for pci_p2pdma_state(), do not use directly */ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page); @@ -167,8 +177,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); + __pci_p2pdma_update_state(state, dev, page); return state->map; } return PCI_P2PDMA_MAP_NONE; @@ -186,7 +195,7 @@ static inline dma_addr_t pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) { WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; + return paddr + state->mem->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ -- Gitee From b406d32e5ea54a90571cee3473b6e7d652ece1a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Jan 2026 14:51:28 +0200 Subject: [PATCH 02/43] PCI/P2PDMA: Add missing struct p2pdma_provider documentation ANBZ: #31969 commit 80d9411c00e805488b631c91034e9b6c14a6dbdc upstream. 
Two fields in struct p2pdma_provider were not documented, which resulted in the following kernel-doc warning: Warning: include/linux/pci-p2pdma.h:26 struct member 'owner' not described in 'p2pdma_provider' Warning: include/linux/pci-p2pdma.h:26 struct member 'bus_offset' not described in 'p2pdma_provider' Repro: $ scripts/kernel-doc -none include/linux/pci-p2pdma.h Fixes: f58ef9d1d135 ("PCI/P2PDMA: Separate the mmap() support from the core logic") Reported-by: Bjorn Helgaas Closes: https://lore.kernel.org/all/20260102234033.GA246107@bhelgaas Signed-off-by: Leon Romanovsky Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Link: https://patch.msgid.link/20260104-fix-p2p-kdoc-v1-1-6d181233f8bc@nvidia.com Signed-off-by: Ruidong Tian --- include/linux/pci-p2pdma.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 27a2c399f47d..c0034b9b561e 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -20,6 +20,8 @@ struct scatterlist; * struct p2pdma_provider * * A p2pdma provider is a range of MMIO address space available to the CPU. + * @owner: Device to which this provider belongs. + * @bus_offset: Bus offset for p2p communication. */ struct p2pdma_provider { struct device *owner; -- Gitee From d0d4bf764dcd127875b0d2738be0d22bceace13d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:21 +0200 Subject: [PATCH 03/43] PCI/P2PDMA: Simplify bus address mapping API ANBZ: #31969 commit d4504262f745e48c1739c8b864f779b4b0f9de80 upstream. Update the pci_p2pdma_bus_addr_map() function to take a direct pointer to the p2pdma_provider structure instead of the pci_p2pdma_map_state. This simplifies the API by removing the need for callers to extract the provider from the state structure. The change updates all callers across the kernel (block layer, IOMMU, DMA direct, and HMM) to pass the provider pointer directly, making the code more explicit and reducing unnecessary indirection. This also removes the runtime warning check since callers now have direct control over which provider they use. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-2-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/iommu/dma-iommu.c | 4 ++-- include/linux/pci-p2pdma.h | 7 +++---- kernel/dma/direct.c | 4 ++-- mm/hmm.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 95eecab1f472..6ae0c35e62ca 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1456,8 +1456,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. */ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index c0034b9b561e..17bacd304208 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -188,16 +188,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, /** * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. 
- * @state: P2P state structure + * @provider: P2P provider structure * @paddr: physical address to map * * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. */ static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr) { - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->mem->bus_offset; + return paddr + provider->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b078209a72e7..0a53785b521a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -495,8 +495,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, } break; case PCI_P2PDMA_MAP_BUS_ADDR: - sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(sg)); + sg->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(sg)); sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; diff --git a/mm/hmm.c b/mm/hmm.c index 53272798a055..197a3687c102 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -761,7 +761,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, break; case PCI_P2PDMA_MAP_BUS_ADDR: pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; - return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr); default: return DMA_MAPPING_ERROR; } -- Gitee From 3d500aa393def6bb6c57c102336f8cee10a22fe4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:22 +0200 Subject: [PATCH 04/43] PCI/P2PDMA: Refactor to separate core P2P functionality from memory allocation ANBZ: #31969 commit 372d6d1b8ae3cdfe6b0638a0a848c6865ec94567 upstream. Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA functionality from the optional memory allocation layer. This creates a two-tier architecture: The core layer provides P2P mapping functionality for physical addresses based on PCI device MMIO BARs and integrates with the DMA API for mapping operations. This layer is required for all P2PDMA users. The optional upper layer provides memory allocation capabilities including gen_pool allocator, struct page support, and sysfs interface for user space access. This separation allows subsystems like DMABUF to use only the core P2P mapping functionality without the overhead of memory allocation features they don't need. The core functionality is now available through the new pcim_p2pdma_provider() function that returns a p2pdma_provider structure. 
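To make the intended calling convention concrete, here is a minimal sketch of how an exporting driver might use these two entry points during probe(); the driver name, BAR index, and error handling are illustrative assumptions, not part of the patch:

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct p2pdma_provider *provider;
	int ret;

	/* Core-layer setup only: no pgmap, gen_pool or sysfs is created */
	ret = pcim_p2pdma_init(pdev);
	if (ret)
		return ret;

	/* Fetch the provider handle for BAR 0; valid until remove() */
	provider = pcim_p2pdma_provider(pdev, 0);
	if (!provider)
		return -EINVAL;

	/* Stash the provider for later use, e.g. at DMABUF attach time */
	return 0;
}
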
Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-3-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/pci/p2pdma.c | 151 +++++++++++++++++++++++++++++-------- include/linux/pci-p2pdma.h | 11 +++ 2 files changed, 131 insertions(+), 31 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 6b7c34d25961..6a78a218a0ff 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,11 +25,12 @@ struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct p2pdma_provider mem[PCI_STD_NUM_BARS]; }; struct pci_p2pdma_pagemap { struct dev_pagemap pgmap; - struct p2pdma_provider mem; + struct p2pdma_provider *mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -211,7 +212,7 @@ static void p2pdma_page_free(struct page *page) struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ struct pci_p2pdma *p2pdma = rcu_dereference_protected( - to_pci_dev(pgmap->mem.owner)->p2pdma, 1); + to_pci_dev(pgmap->mem->owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -234,44 +235,123 @@ static void pci_p2pdma_release(void *data) /* Flush and disable pci_alloc_p2p_mem() */ pdev->p2pdma = NULL; - synchronize_rcu(); + if (p2pdma->pool) + synchronize_rcu(); + xa_destroy(&p2pdma->map_types); + + if (!p2pdma->pool) + return; gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); - xa_destroy(&p2pdma->map_types); } -static int pci_p2pdma_setup(struct pci_dev *pdev) +/** + * pcim_p2pdma_init - Initialise peer-to-peer DMA providers + * @pdev: The PCI device to enable P2PDMA for + * + * This function initializes the peer-to-peer DMA infrastructure + * for a PCI device. It allocates and sets up the necessary data + * structures to support P2PDMA operations, including mapping type + * tracking. + */ +int pcim_p2pdma_init(struct pci_dev *pdev) { - int error = -ENOMEM; struct pci_p2pdma *p2p; + int i, ret; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2p) + return 0; p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); if (!p2p) return -ENOMEM; xa_init(&p2p->map_types); + /* + * Iterate over all standard PCI BARs and record only those that + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O + * port BARs) since they cannot be used for peer-to-peer (P2P) + * transactions. 
+ */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); - if (!p2p->pool) - goto out; + p2p->mem[i].owner = &pdev->dev; + p2p->mem[i].bus_offset = + pci_bus_address(pdev, i) - pci_resource_start(pdev, i); + } - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); - if (error) - goto out_pool_destroy; + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (ret) + goto out_p2p; - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); - if (error) + rcu_assign_pointer(pdev->p2pdma, p2p); + return 0; + +out_p2p: + devm_kfree(&pdev->dev, p2p); + return ret; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_init); + +/** + * pcim_p2pdma_provider - Get peer-to-peer DMA provider + * @pdev: The PCI device to enable P2PDMA for + * @bar: BAR index to get provider + * + * This function gets peer-to-peer DMA provider for a PCI device. The lifetime + * of the provider (and of course the MMIO) is bound to the lifetime of the + * driver. A driver calling this function must ensure that all references to the + * provider, and any DMA mappings created for any MMIO, are all cleaned up + * before the driver remove() completes. + * + * Since P2P is almost always shared with a second driver this means some system + * to notify, invalidate and revoke the MMIO's DMA must be in place to use this + * function. For example a revoke can be built using DMABUF. + */ +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar) +{ + struct pci_p2pdma *p2p; + + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + return NULL; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (WARN_ON(!p2p)) + /* Someone forgot to call to pcim_p2pdma_init() before */ + return NULL; + + return &p2p->mem[bar]; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_provider); + +static int pci_p2pdma_setup_pool(struct pci_dev *pdev) +{ + struct pci_p2pdma *p2pdma; + int ret; + + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2pdma->pool) + /* We already setup pools, do nothing, */ + return 0; + + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); + if (!p2pdma->pool) + return -ENOMEM; + + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); + if (ret) goto out_pool_destroy; - rcu_assign_pointer(pdev->p2pdma, p2p); return 0; out_pool_destroy: - gen_pool_destroy(p2p->pool); -out: - devm_kfree(&pdev->dev, p2p); - return error; + gen_pool_destroy(p2pdma->pool); + p2pdma->pool = NULL; + return ret; } static void pci_p2pdma_unmap_mappings(void *data) @@ -283,7 +363,7 @@ static void pci_p2pdma_unmap_mappings(void *data) * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. 
*/ - sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj, + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj, &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -302,6 +382,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { struct pci_p2pdma_pagemap *p2p_pgmap; + struct p2pdma_provider *mem; struct dev_pagemap *pgmap; struct pci_p2pdma *p2pdma; void *addr; @@ -319,11 +400,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (size + offset > pci_resource_len(pdev, bar)) return -EINVAL; - if (!pdev->p2pdma) { - error = pci_p2pdma_setup(pdev); - if (error) - return error; - } + error = pcim_p2pdma_init(pdev); + if (error) + return error; + + error = pci_p2pdma_setup_pool(pdev); + if (error) + return error; + + mem = pcim_p2pdma_provider(pdev, bar); + /* + * We checked validity of BAR prior to call + * to pcim_p2pdma_provider. It should never return NULL. + */ + if (WARN_ON(!mem)) + return -EINVAL; p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); if (!p2p_pgmap) @@ -335,9 +426,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - p2p_pgmap->mem.owner = &pdev->dev; - p2p_pgmap->mem.bus_offset = - pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); + p2p_pgmap->mem = mem; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -1022,11 +1111,11 @@ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, { struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); - if (state->mem == &p2p_pgmap->mem) + if (state->mem == p2p_pgmap->mem) return; - state->mem = &p2p_pgmap->mem; - state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev); + state->mem = p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev); } /** diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 17bacd304208..25d7cd85ed28 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -29,6 +29,8 @@ struct p2pdma_provider { }; #ifdef CONFIG_PCI_P2PDMA +int pcim_p2pdma_init(struct pci_dev *pdev); +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, @@ -47,6 +49,15 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); #else /* CONFIG_PCI_P2PDMA */ +static inline int pcim_p2pdma_init(struct pci_dev *pdev) +{ + return -EOPNOTSUPP; +} +static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, + int bar) +{ + return NULL; +} static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { -- Gitee From 26249c09b592be8934dffb3c228ea26533698990 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:23 +0200 Subject: [PATCH 05/43] PCI/P2PDMA: Provide an access to pci_p2pdma_map_type() function ANBZ: #31969 commit 395698bd2cd7639b85784a4a8f5ddb7a581e353c upstream. Provide an access to pci_p2pdma_map_type() function to allow subsystems to determine the appropriate mapping type for P2PDMA transfers between a provider and target device. 
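As a rough sketch of how a subsystem-level consumer might branch on the result, assuming the phys_addr_t DMA API used later in this series (dma_map_phys() with DMA_ATTR_MMIO); demo_map_one() is a hypothetical helper, not part of the patch:

static dma_addr_t demo_map_one(struct device *dev,
			       struct p2pdma_provider *provider,
			       phys_addr_t paddr, size_t len,
			       enum dma_data_direction dir)
{
	switch (pci_p2pdma_map_type(provider, dev)) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		/* Traffic stays below the host bridge: program bus addresses */
		return pci_p2pdma_bus_addr_map(provider, paddr);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/* Route through the host bridge: normal DMA mapping of MMIO */
		return dma_map_phys(dev, paddr, len, dir, DMA_ATTR_MMIO);
	default:
		/* PCI_P2PDMA_MAP_NOT_SUPPORTED and friends: refuse */
		return DMA_MAPPING_ERROR;
	}
}
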
The pci_p2pdma_map_type() function is the core P2P layer version of the existing public, but struct page focused, pci_p2pdma_state() function. It returns the same result. It is required to use the p2p subsystem from drivers that don't use the struct page layer. Like __pci_p2pdma_update_state() it is not an exported function. The idea is that only subsystem code will implement mapping helpers for taking in phys_addr_t lists, this is deliberately not made accessible to every driver to prevent abuse. Following patches will use this function to implement a shared DMA mapping helper for DMABUF. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-4-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/pci/p2pdma.c | 14 ++++++- include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 6a78a218a0ff..c87ab4e3cabc 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -1075,8 +1075,18 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type -pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +/** + * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers + * @provider: P2PDMA provider structure + * @dev: Target device for the transfer + * + * Determines how peer-to-peer DMA transfers should be mapped between + * the provider and the target device. The mapping type indicates whether + * the transfer can be done directly through PCI switches or must go + * through the host bridge. + */ +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; struct pci_dev *pdev = to_pci_dev(provider->owner); diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 25d7cd85ed28..4eb6a99caf8a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -28,6 +28,45 @@ struct p2pdma_provider { u64 bus_offset; }; +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. 
+ */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + #ifdef CONFIG_PCI_P2PDMA int pcim_p2pdma_init(struct pci_dev *pdev); struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); @@ -48,6 +87,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev); #else /* CONFIG_PCI_P2PDMA */ static inline int pcim_p2pdma_init(struct pci_dev *pdev) { @@ -113,6 +154,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +{ + return PCI_P2PDMA_MAP_NOT_SUPPORTED; +} #endif /* CONFIG_PCI_P2PDMA */ @@ -127,45 +173,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before - * the mapping type has been calculated. Exported routines for the API - * will never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - struct pci_p2pdma_map_state { struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; -- Gitee From b1ae641b163352f236216f68dd773821006d3d71 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Nov 2025 11:28:24 +0200 Subject: [PATCH 06/43] PCI/P2PDMA: Document DMABUF model ANBZ: #31969 commit 50d44fce53b6474147fcfa0a27a6e5fd290dd8ed upstream. Reflect latest changes in p2p implementation to support DMABUF lifecycle. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-5-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- Documentation/driver-api/pci/p2pdma.rst | 97 +++++++++++++++++++------ 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst index d0b241628cf1..280673b50350 100644 --- a/Documentation/driver-api/pci/p2pdma.rst +++ b/Documentation/driver-api/pci/p2pdma.rst @@ -9,22 +9,48 @@ between two devices on the bus. This type of transaction is henceforth called Peer-to-Peer (or P2P). However, there are a number of issues that make P2P transactions tricky to do in a perfectly safe way. 
-One of the biggest issues is that PCI doesn't require forwarding -transactions between hierarchy domains, and in PCIe, each Root Port -defines a separate hierarchy domain. To make things worse, there is no -simple way to determine if a given Root Complex supports this or not. -(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel -only supports doing P2P when the endpoints involved are all behind the -same PCI bridge, as such devices are all in the same PCI hierarchy -domain, and the spec guarantees that all transactions within the -hierarchy will be routable, but it does not require routing -between hierarchies. - -The second issue is that to make use of existing interfaces in Linux, -memory that is used for P2P transactions needs to be backed by struct -pages. However, PCI BARs are not typically cache coherent so there are -a few corner case gotchas with these pages so developers need to -be careful about what they do with them. +For PCIe the routing of Transaction Layer Packets (TLPs) is well-defined up +until they reach a host bridge or root port. If the path includes PCIe switches +then based on the ACS settings the transaction can route entirely within +the PCIe hierarchy and never reach the root port. The kernel will evaluate +the PCIe topology and always permit P2P in these well-defined cases. + +However, if the P2P transaction reaches the host bridge then it might have to +hairpin back out the same root port, be routed inside the CPU SOC to another +PCIe root port, or routed internally to the SOC. + +The PCIe specification doesn't define the forwarding of transactions between +hierarchy domains and the kernel defaults to blocking such routing. There is an +allowlist used to detect known-good HW, in which case P2P between any +two PCIe devices will be permitted. + +Since P2P inherently is doing transactions between two devices, it requires two +drivers to be co-operating inside the kernel. The providing driver has to convey +its MMIO to the consuming driver. To meet the driver model lifecycle rules, the +MMIO must have all DMA mappings removed, all CPU accesses prevented, and all page +table mappings undone before the providing driver completes remove(). + +This requires the providing and consuming driver to actively work together to +guarantee that the consuming driver has stopped using the MMIO during a removal +cycle. This is done by either a synchronous invalidation shutdown or waiting +for all usage refcounts to reach zero. + +At the lowest level the P2P subsystem offers a naked struct p2p_provider that +delegates lifecycle management to the providing driver. It is expected that +drivers using this option will wrap their MMIO memory in DMABUF and use DMABUF +to provide an invalidation shutdown. These MMIO addresses have no struct page, and +if used with mmap() must create special PTEs. As such there are very few +kernel uAPIs that can accept pointers to them; in particular they cannot be used +with read()/write(), including O_DIRECT. + +Building on this, the subsystem offers a layer to wrap the MMIO in a ZONE_DEVICE +pgmap of MEMORY_DEVICE_PCI_P2PDMA to create struct pages. The lifecycle of +pgmap ensures that when the pgmap is destroyed all other drivers have stopped +using the MMIO. This option works with O_DIRECT flows, in some cases, if the +underlying subsystem supports handling MEMORY_DEVICE_PCI_P2PDMA through +FOLL_PCI_P2PDMA. The use of FOLL_LONGTERM is prevented.
As this relies on pgmap +it also relies on architecture support along with alignment and minimum size +limitations. Driver Writer's Guide @@ -114,14 +140,39 @@ allocating scatter-gather lists with P2P memory. Struct Page Caveats ------------------- -Driver writers should be very careful about not passing these special -struct pages to code that isn't prepared for it. At this time, the kernel -interfaces do not have any checks for ensuring this. This obviously -precludes passing these pages to userspace. +While the MEMORY_DEVICE_PCI_P2PDMA pages can be installed in VMAs, +pin_user_pages() and related will not return them unless FOLL_PCI_P2PDMA is set. -P2P memory is also technically IO memory but should never have any side -effects behind it. Thus, the order of loads and stores should not be important -and ioreadX(), iowriteX() and friends should not be necessary. +The MEMORY_DEVICE_PCI_P2PDMA pages require care to support in the kernel. The +KVA is still MMIO and must still be accessed through the normal +readX()/writeX()/etc helpers. Direct CPU access (e.g. memcpy) is forbidden, just +like any other MMIO mapping. While this will actually work on some +architectures, others will experience corruption or just crash in the kernel. +Supporting FOLL_PCI_P2PDMA in a subsystem requires scrubbing it to ensure no CPU +access happens. + + +Usage With DMABUF +================= + +DMABUF provides an alternative to the above struct page-based +client/provider/orchestrator system and should be used when struct page +doesn't exist. In this mode the exporting driver will wrap +some of its MMIO in a DMABUF and give the DMABUF FD to userspace. + +Userspace can then pass the FD to an importing driver which will ask the +exporting driver to map it to the importer. + +In this case the initiator and target pci_devices are known and the P2P subsystem +is used to determine the mapping type. The phys_addr_t-based DMA API is used to +establish the dma_addr_t. + +Lifecycle is controlled by DMABUF move_notify(). When the exporting driver wants +to remove() it must deliver an invalidation shutdown to all DMABUF importing +drivers through move_notify() and synchronously DMA unmap all the MMIO. + +No importing driver can continue to have a DMA map to the MMIO after the +exporting driver has destroyed its p2p_provider. P2P DMA Support Library -- Gitee From 38abfc09ab370a24da4cdc9b54d4833b58c1ed32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:25 +0200 Subject: [PATCH 07/43] dma-buf: provide phys_vec to scatter-gather mapping routine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 3aa31a8bb11e47c0ff2b306988d1756b810c1c3c upstream. Add dma_buf_phys_vec_to_sgt() and dma_buf_free_sgt() helpers to convert an array of MMIO physical address ranges into scatter-gather tables with proper DMA mapping. These common functions are a starting point and support any PCI drivers creating mappings from their BAR's MMIO addresses. VFIO is one case, as shortly will be RDMA. We can review existing DRM drivers to refactor them separately. We hope this will evolve into routines to help common DRM that include mixed CPU and MMIO mappings. Compared to the dma_map_resource() abuse this implementation handles the complicated PCI P2P scenarios properly, especially when an IOMMU is enabled: - Direct bus address mapping without IOVA allocation for PCI_P2PDMA_MAP_BUS_ADDR, using pci_p2pdma_bus_addr_map(). 
This happens if the IOMMU is enabled but the PCIe switch ACS flags allow transactions to avoid the host bridge. Further, this handles the slightly obscure case of MMIO with a phys_addr_t that is different from the physical BAR programming (bus offset). The phys_addr_t is converted to a dma_addr_t and accommodates this effect. This enables certain real systems to work, especially on ARM platforms. - Mapping through host bridge with IOVA allocation and DMA_ATTR_MMIO attribute for MMIO memory regions (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE). This happens when the IOMMU is enabled and the ACS flags are forcing all traffic to the IOMMU - i.e. for virtualization systems. - Cases where P2P is not supported through the host bridge/CPU. The P2P subsystem is the proper place to detect this and block it. Helper functions fill_sg_entry() and calc_sg_nents() handle the scatter-gather table construction, splitting large regions into UINT_MAX-sized chunks to fit within sg->length field limits. Since the physical address based DMA API forbids use of the CPU list of the scatterlist, this will produce a mangled scatterlist that has a fully zero-length and NULL'd CPU list. The list is 0 length, all the struct page pointers are NULL and zero sized. This is stronger and more robust than the existing mangle_sg_table() technique. It is a future project to migrate DMABUF as a subsystem away from using scatterlist for this data structure. Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Christian König Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-6-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson [Ruidong: delete " in EXPORT_SYMBOL_NS_GPL] Signed-off-by: Ruidong Tian --- drivers/dma-buf/Makefile | 2 +- drivers/dma-buf/dma-buf-mapping.c | 248 ++++++++++++++++++++++++++++++ include/linux/dma-buf-mapping.h | 17 ++ include/linux/dma-buf.h | 11 ++ 4 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 drivers/dma-buf/dma-buf-mapping.c create mode 100644 include/linux/dma-buf-mapping.h diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c..2008fb7481b3 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-fence-unwrap.o dma-resv.o + dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o obj-$(CONFIG_DMABUF_HEAPS) += heaps/ obj-$(CONFIG_SYNC_FILE) += sync_file.o diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c new file mode 100644 index 000000000000..8f0c79865e37 --- /dev/null +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA BUF Mapping Helpers + * + */ +#include +#include + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(size_t, length, UINT_MAX); + length -= len; + /* + * DMABUF abuses scatterlist to create a scatterlist + * that does not have any CPU list, only the DMA list. + * Always set the page related values to NULL to ensure + * importers can't use it. The phys_addr based DMA API + * does not require the CPU list for mapping or unmapping.
+ */ + sg_set_page(sgl, NULL, 0, 0); + sg_dma_address(sgl) = addr + i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct dma_iova_state *state, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size) +{ + unsigned int nents = 0; + size_t i; + + if (!state || !dma_use_iova(state)) { + for (i = 0; i < nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + } else { + /* + * In the IOVA case, there is only one SG entry which spans + * the whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(size, UINT_MAX); + } + + return nents; +} + +/** + * struct dma_buf_dma - holds DMA mapping information + * @sgt: Scatter-gather table + * @state: DMA IOVA state relevant in IOMMU-based DMA + * @size: Total size of DMA transfer + */ +struct dma_buf_dma { + struct sg_table sgt; + struct dma_iova_state *state; + size_t size; +}; + +/** + * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment + * from arrays of physical vectors. This function is intended for MMIO memory + * only. + * @attach: [in] attachment whose scatterlist is to be returned + * @provider: [in] p2pdma provider + * @phys_vec: [in] array of physical vectors + * @nr_ranges: [in] number of entries in phys_vec array + * @size: [in] total size of phys_vec + * @dir: [in] direction of DMA transfer + * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. May return -EINTR if it is interrupted by a signal. + * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * + * A mapping must be unmapped by using dma_buf_free_sgt(). + * + * NOTE: This function is intended for exporters. If direct traffic routing is + * mandatory, the exporter should call pci_p2pdma_map_type() before calling + * this function. + */ +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir) +{ + unsigned int nents, mapped_len = 0; + struct dma_buf_dma *dma; + struct scatterlist *sgl; + dma_addr_t addr; + size_t i; + int ret; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (WARN_ON(!attach || !attach->dmabuf || !provider)) + /* This function is supposed to work on MMIO memory only */ + return ERR_PTR(-EINVAL); + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) + return ERR_PTR(-ENOMEM); + + switch (pci_p2pdma_map_type(provider, attach->dev)) { + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need for IOVA at all in this flow.
+ */ + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL); + if (!dma->state) { + ret = -ENOMEM; + goto err_free_dma; + } + + dma_iova_try_alloc(attach->dev, dma->state, 0, size); + break; + default: + ret = -EINVAL; + goto err_free_dma; + } + + nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size); + ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_free_state; + + sgl = dma->sgt.sgl; + + for (i = 0; i < nr_ranges; i++) { + if (!dma->state) { + addr = pci_p2pdma_bus_addr_map(provider, + phys_vec[i].paddr); + } else if (dma_use_iova(dma->state)) { + ret = dma_iova_link(attach->dev, dma->state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attach->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + ret = dma_mapping_error(attach->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!dma->state || !dma_use_iova(dma->state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (dma->state && dma_use_iova(dma->state)) { + WARN_ON_ONCE(mapped_len != size); + ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + + sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr); + } + + dma->size = size; + + /* + * No CPU list included — set orig_nents = 0 so others can detect + * this via SG table (use nents only). + */ + dma->sgt.orig_nents = 0; + + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return &dma->sgt; + +err_unmap_dma: + if (!i || !dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, mapped_len, dir, + DMA_ATTR_MMIO); + } else { + for_each_sgtable_dma_sg(&dma->sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + sg_free_table(&dma->sgt); +err_free_state: + kfree(dma->state); +err_free_dma: + kfree(dma); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, DMA_BUF); + +/** + * dma_buf_free_sgt- unmaps the buffer + * @attach: [in] attachment to unmap buffer from + * @sgt: [in] scatterlist info of the buffer to unmap + * @dir: [in] direction of DMA transfer + * + * This unmaps a DMA mapping for @attached obtained + * by dma_buf_phys_vec_to_sgt(). 
*/ +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt); + int i; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (!dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, dma->size, dir, + DMA_ATTR_MMIO); + } else { + struct scatterlist *sgl; + + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + + sg_free_table(sgt); + kfree(dma->state); + kfree(dma); + +} +EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, DMA_BUF); diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h new file mode 100644 index 000000000000..a3c0ce2d3a42 --- /dev/null +++ b/include/linux/dma-buf-mapping.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * DMA BUF Mapping Helpers + * + */ +#ifndef __DMA_BUF_MAPPING_H__ +#define __DMA_BUF_MAPPING_H__ +#include + +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir); +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir); +#endif diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e39741e8d211..28066eb28fd5 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -529,6 +530,16 @@ struct dma_buf_export_info { void *priv; }; +/** + * struct dma_buf_phys_vec - describes a contiguous chunk of memory + * @paddr: Physical address of this chunk + * @len: Length of this chunk + */ +struct dma_buf_phys_vec { + phys_addr_t paddr; + size_t len; +}; + /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters + * @name: export-info name -- Gitee From dffb5374a865390e8a5b7ce41e2e5926008a60da Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 25 Nov 2025 17:11:18 -0800 Subject: [PATCH 08/43] dma-buf: fix integer overflow in fill_sg_entry() for buffers >= 8GiB ANBZ: #31969 commit 590d745680309f8d956c3f0a97270fe65013b272 upstream. fill_sg_entry() splits large DMA buffers into multiple scatter-gather entries, each holding up to UINT_MAX bytes. When calculating the DMA address for entries beyond the second one, the expression (i * UINT_MAX) causes integer overflow due to 32-bit arithmetic. This manifests when the input length is >= 8 GiB, which results in looping with i >= 2. Fix by casting i to dma_addr_t before multiplication. Fixes: 3aa31a8bb11e ("dma-buf: provide phys_vec to scatter-gather mapping routine") Signed-off-by: Alex Mastro Reviewed-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20251125-dma-buf-overflow-v1-1-b70ea1e6c4ba@fb.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf-mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c index 8f0c79865e37..e352f53fd806 100644 --- a/drivers/dma-buf/dma-buf-mapping.c +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -24,7 +24,7 @@ static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, * does not require the CPU list for mapping or unmapping.
*/ sg_set_page(sgl, NULL, 0, 0); - sg_dma_address(sgl) = addr + i * UINT_MAX; + sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX; sg_dma_len(sgl) = len; sgl = sg_next(sgl); } -- Gitee From 6573020e1ce2fc71bc635e1a05929818a81670ec Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:26 +0200 Subject: [PATCH 09/43] vfio: Export vfio device get and put registration helpers ANBZ: #31969 commit 64a5dedcff801072154a806102d731ecdf0e7552 upstream. These helpers are useful for managing additional references taken on the device from other associated VFIO modules. Original-patch-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/vfio_main.c | 2 ++ include/linux/vfio.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 83774e0541a3..58c5a8b65f44 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b589feb39191..318abae0666c 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -279,6 +279,8 @@ static inline void vfio_put_device(struct vfio_device *device) int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +void vfio_device_put_registration(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); -- Gitee From bc788a6faafb1173916d42fb6b70d3f4f8f01f25 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:27 +0200 Subject: [PATCH 10/43] vfio/pci: Share the core device pointer while invoking feature functions ANBZ: #31969 commit 47d13c939d89d348966857b24bb15092398ed8bb upstream. There is no need to share the main device pointer (struct vfio_device *) with all the feature functions as they only need the core device pointer. Therefore, extract the core device pointer once in the caller (vfio_pci_core_ioctl_feature) and share it instead. 
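Condensed, the resulting dispatch shape is as follows (a sketch assembled from the diff below; the elided cases follow the same pattern):

int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
				void __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);

	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
		/* Handlers now take the core device pointer directly */
		return vfio_pci_core_pm_entry(vdev, flags, arg, argsz);
	/* ... remaining feature cases as in the diff ... */
	default:
		return -ENOTTY;
	}
}
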
Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-8-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_core.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f998a64e2055..f4259a1ed65c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -301,11 +301,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -322,12 +320,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -378,11 +374,9 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -1523,11 +1517,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1554,16 +1547,19 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return 
vfio_pci_core_feature_token(vdev, flags, arg, argsz); default: return -ENOTTY; } } -- Gitee From f8ae063575de92c5c36f20f31c8fae9a6e6a497d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:28 +0200 Subject: [PATCH 11/43] vfio/pci: Enable peer-to-peer DMA transactions by default ANBZ: #31969 commit 35c3503908d320989d76b735580fd93aa497185f upstream. Make sure that all VFIO PCI devices have peer-to-peer capabilities enabled, so we are able to export their MMIO memory through DMABUF. VFIO has always supported P2P mappings with itself. VFIO type 1 insecurely reads PFNs directly out of a VMA's PTEs and programs them into the IOMMU, allowing any two VFIO devices to perform P2P to each other. All existing VMMs use this capability to export P2P into a VM where the VM could set up any kind of DMA it likes. Projects like DPDK/SPDK are also known to make use of this, though less frequently. As a first step to more properly integrating VFIO with the P2P subsystem, unconditionally enable P2P support for VFIO PCI devices. The struct p2pdma_provider will act as a handle to the P2P subsystem to do things like DMA mapping. While real PCI devices have to support P2P (they can't even tell if an IOVA is P2P or not), there may be fake PCI devices that may trigger some kind of catastrophic system failure. To date VFIO has never tripped up on such a case, but if one is discovered, the plan is to add a PCI quirk and have pcim_p2pdma_init() fail. This will fully block the broken device throughout any users of the P2P subsystem in the kernel. Thus P2P through DMABUF will follow the historical VFIO model and be unconditionally enabled by vfio-pci. Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-9-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f4259a1ed65c..ce9c3e18d7ed 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -30,6 +30,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_EEH) #include #endif @@ -2127,6 +2128,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2136,6 +2138,9 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); -- Gitee From 8e8852973384b436a1f4b0a59a45750097d27f5f Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Fri, 15 Sep 2023 10:24:22 +0530 Subject: [PATCH 12/43] vfio: add bus master feature to device feature ioctl ANBZ: #31969 commit f59a7b6af012619199c55f587a6c4ed681639b32 upstream. Add bus mastering control to the VFIO_DEVICE_FEATURE IOCTL. The VFIO user can use this feature to enable or disable the Bus Mastering of a device bound to VFIO.
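A hedged sketch of how userspace could drive this through the existing VFIO_DEVICE_FEATURE ioctl; device_fd and the buffer layout are illustrative, while the structures and flags come from the uapi header in the diff below:

	struct vfio_device_feature *feature;
	struct vfio_device_feature_bus_master *bm;
	__u8 buf[sizeof(*feature) + sizeof(*bm)]
		__attribute__((aligned(8)));	/* keep the header aligned */

	feature = (struct vfio_device_feature *)buf;
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_BUS_MASTER;
	bm = (struct vfio_device_feature_bus_master *)feature->data;
	bm->op = VFIO_DEVICE_FEATURE_SET_MASTER;	/* allow device DMA */

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		perror("VFIO_DEVICE_FEATURE");
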
Co-developed-by: Shubham Rohila Signed-off-by: Shubham Rohila Signed-off-by: Nipun Gupta Link: https://lore.kernel.org/r/20230915045423.31630-2-nipun.gupta@amd.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- include/uapi/linux/vfio.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index aa9135a95c9c..26a1e5b39b2e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1471,6 +1471,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in the op flag. + * + * The functionality is incorporated for devices that need bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- Gitee From e661f5e82bf89497cca3ad2da79cbe8603f31825 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Fri, 8 May 2026 17:05:05 +0800 Subject: [PATCH 13/43] anolis: vfio/pci: Add local get_file_active() helper for 6.6 dma-buf support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 The iommufd dma-buf support backport introduces calls to get_file_active(), but this helper was only added upstream in v6.7 (commit 61d4fb0b349e ("file, i915: fix file reference for mmap_singleton()")). On a 6.6-based tree this results in a build failure: drivers/vfio/pci/vfio_pci_dmabuf.c: In function 'vfio_pci_dma_buf_move': drivers/vfio/pci/vfio_pci_dmabuf.c:317:22: error: implicit declaration of function 'get_file_active' Fully backporting the new file_ref and SLAB_TYPESAFE_BY_RCU based infrastructure from 6.7+ would require invasive changes across the VFS and file lifetime management code, which is out of scope for this tree. Instead, provide a local, simplified get_file_active() implementation in vfio_pci_priv.h for kernels that do not have the upstream helper. The local version: - uses rcu_read_lock()/rcu_read_unlock() to protect the struct file - reads the file pointer from dmabuf->file - uses atomic_long_inc_not_zero(&file->f_count) to conditionally acquire an active reference, or returns NULL if the file is already being torn down This is sufficient and safe for the VFIO dma-buf users because: - all callers hold vdev->memory_lock in write mode while iterating the vdev->dmabufs list, so dmabuf->file is stable - the extra f_count reference is always dropped with fput(), as in the upstream code - we do not change the global file lifetime or SLAB behaviour This fixes the build of the vfio_pci dmabuf/iommufd support on top of a 6.6-based kernel without pulling in the full 6.7 VFS changes.
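For reference, the caller pattern this helper supports (a sketch of the vfio_pci_dma_buf_move() loop added later in this series) is:

	/* callers hold down_write(&vdev->memory_lock) */
	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		if (!get_file_active(&priv->dmabuf->file))
			continue; /* file already being torn down */

		/* ... revoke or restore access to priv->dmabuf ... */

		fput(priv->dmabuf->file); /* drop the extra f_count */
	}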
Signed-off-by: Shuai Xue Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_priv.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index d60e5e43cb55..c085fd5f2fce 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -119,4 +119,14 @@ static inline void vfio_pci_liveupdate_cleanup(void) } #endif /* CONFIG_VFIO_PCI_LIVEUPDATE */ +static inline struct file *get_file_active(struct file **f) +{ + struct file *file; + rcu_read_lock(); + file = *f; + if (file && !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + rcu_read_unlock(); + return file; +} #endif -- Gitee From 7eedb7849fcb45d1e7a5ef72ed22fc12fd50ea4a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:29 +0200 Subject: [PATCH 14/43] vfio/pci: Add dma-buf export support for MMIO regions ANBZ: #31969 commit 5d74781ebc86c5fa9e9d6934024c505412de9b52 upstream. Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMAs. When VFIO shuts down, these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish an MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export an FD-based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening a UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built-in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver, ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience an MCE-type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform.
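To illustrate the uapi this patch adds (a sketch only; device_fd is assumed to be an open VFIO device fd and error handling is omitted), userspace can ask for a dmabuf covering a slice of a BAR like this:

	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_dma_buf) +
		 sizeof(struct vfio_region_dma_range)];
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
	struct vfio_device_feature_dma_buf *get =
		(struct vfio_device_feature_dma_buf *)feature->data;
	int dmabuf_fd;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF;
	get->region_index = 0;			/* BAR 0 */
	get->open_flags = O_RDWR | O_CLOEXEC;
	get->flags = 0;
	get->nr_ranges = 1;
	get->dma_ranges[0].offset = 0;		/* must be page aligned */
	get->dma_ranges[0].length = 0x100000;	/* must be page aligned */

	dmabuf_fd = ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);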
A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Signed-off-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/Kconfig | 3 + drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 5 + drivers/vfio/pci/vfio_pci_config.c | 22 +- drivers/vfio/pci/vfio_pci_core.c | 18 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 316 +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_priv.h | 23 +++ include/linux/vfio_pci_core.h | 42 ++++ include/uapi/linux/vfio.h | 28 +++ 9 files changed, 453 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index ccc5abbc415a..aaa0836ac7b7 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -60,6 +60,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. 
+config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + config VFIO_PCI_LIVEUPDATE bool "VFIO PCI support for Live Update (EXPERIMENTAL)" depends on VFIO_PCI && PCI_LIVEUPDATE diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 4addb1a56fea..019101a48aff 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 14e09c457cfe..ac2f34866fe1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -146,6 +146,10 @@ const struct vfio_device_ops vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -160,6 +164,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8ea38e7421df..a821de69e2a1 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); 
pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ce9c3e18d7ed..3a475cd4fa17 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -289,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -372,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -736,6 +740,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); @@ -1272,7 +1278,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1561,6 +1570,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -2141,6 +2152,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) ret = pcim_p2pdma_init(vdev->pdev); if (ret && ret != -EOPNOTSUPP) return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2505,6 +2517,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2533,8 +2546,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..e003bf550f76 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ */ +#include +#include +#include + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS(DMA_BUF); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, 
&length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN if userspace proposes a size that + * is too big, eg with lots of ranges. + */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference; when the file closes, the dmabuf + * will be released.
+ */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index c085fd5f2fce..8b898641c496 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -105,6 +105,29 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #ifdef CONFIG_VFIO_PCI_LIVEUPDATE int __init vfio_pci_liveupdate_init(void); void vfio_pci_liveupdate_cleanup(void); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 01d6abd6a1cc..c6e9df4ecfd6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -27,6 +27,8 @@ struct vfio_pci_core_device; struct vfio_pci_core_device_ser; struct vfio_pci_region; +struct p2pdma_provider; +struct dma_buf_phys_vec; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -50,9 +52,48 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF) +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct 
vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len); +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +#else +static inline int +vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + return -EINVAL; +} +static inline int vfio_pci_core_get_dmabuf_phys( + struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider, + unsigned int region_index, struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, size_t nr_ranges) +{ + return -EOPNOTSUPP; +} +#endif + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; @@ -95,6 +136,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; struct vfio_pci_core_device_ser *liveupdate_incoming_state; }; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 26a1e5b39b2e..4703f1c566a9 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -1492,6 +1493,33 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the + * region selected. + * + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, + * etc. offset/length specify a slice of the region to create the dmabuf from. + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. + * + * flags should be 0. + * + * Return: The fd number on success, -1 and errno is set on failure. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 + +struct vfio_region_dma_range { + __u64 offset; + __u64 length; +}; + +struct vfio_device_feature_dma_buf { + __u32 region_index; + __u32 open_flags; + __u32 flags; + __u32 nr_ranges; + struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges); +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- Gitee From 623c86c5edfb706e60aa1ecd56e7fb72ce9518dd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Jan 2026 17:45:02 +0200 Subject: [PATCH 15/43] vfio: Prevent pinned DMABUF importers from attaching to VFIO DMABUF ANBZ: #31969 commit 61ceaf236115f20f4fdd7cf60f883ada1063349a upstream. Some pinned importers, such as non-ODP RDMA ones, cannot invalidate their mappings and therefore must be prevented from attaching to this exporter.
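For context (illustrative only; names prefixed my_ are placeholders, not part of this patch), the importers that remain able to attach are dynamic ones that supply move_notify-capable attach ops through the existing dma-buf API:

	static void my_move_notify(struct dma_buf_attachment *attach)
	{
		/* unmap DMA and forget the addresses; remap on next use */
	}

	static const struct dma_buf_attach_ops my_attach_ops = {
		.allow_peer2peer = true,
		.move_notify = my_move_notify,
	};

	attach = dma_buf_dynamic_attach(dmabuf, dev, &my_attach_ops, my_priv);

A pinned importer instead ends up in the exporter's pin callback, which this patch makes return -EOPNOTSUPP, so the attach fails cleanly.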
Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions") Signed-off-by: Leon Romanovsky Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20260121-vfio-add-pin-v1-1-4e04916b17f1@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_dmabuf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index e003bf550f76..d9a2a1acc73a 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -20,6 +20,16 @@ struct vfio_pci_dma_buf { u8 revoked : 1; }; +static int vfio_pci_dma_buf_pin(struct dma_buf_attachment *attachment) +{ + return -EOPNOTSUPP; +} + +static void vfio_pci_dma_buf_unpin(struct dma_buf_attachment *attachment) +{ + /* Do nothing */ +} + static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { @@ -76,6 +86,8 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) } static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .pin = vfio_pci_dma_buf_pin, + .unpin = vfio_pci_dma_buf_unpin, .attach = vfio_pci_dma_buf_attach, .map_dma_buf = vfio_pci_dma_buf_map, .unmap_dma_buf = vfio_pci_dma_buf_unmap, -- Gitee From 336f4063eff5e23d4a4fcec3bec52bcf23b93e3c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Nov 2025 11:28:30 +0200 Subject: [PATCH 16/43] vfio/nvgrace: Support get_dmabuf_phys ANBZ: #31969 commit 5415d887db0e059920cb5673a32cc4d66daa280f upstream. Call vfio_pci_core_fill_phys_vec() with the proper physical ranges for the synthetic BAR 2 and BAR 4 regions. Otherwise use the normal flow based on the PCI BAR. This demonstrates a DMABUF that follows the region info report to only allow mapping parts of the region that are mmappable. Since the BAR is power-of-two sized and the "CXL" region is just page aligned, there can be a padding region at the end that is not mmapped or passed into the DMABUF. The "CXL" ranges that are remapped into BAR 2 and BAR 4 areas are not PCI MMIO, they actually run over the CXL-like coherent interconnect and for the purposes of DMA behave identically to DRAM. We don't try to model this distinction between true PCI BAR memory that takes a real PCI path and the "CXL" memory that takes a different path in the p2p framework for now.
Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-11-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/nvgrace-gpu/main.c | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 6aac1b6503db..46c394420839 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -809,6 +810,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); } +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -828,6 +873,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -1111,6 +1160,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, nvdev->egm_node = egmpxm; } + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); -- Gitee From f72535da0a5e790d58f1eecdb9bc5d79de93a4a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:50:58 -0400 Subject: [PATCH 17/43] vfio/pci: Add vfio_pci_dma_buf_iommufd_map() ANBZ: #31969 commit 96ce2aeb15bd8672ab47abe547e2a1f8ba3886ff upstream. 
This function is used to establish the "private interconnect" between the VFIO DMABUF exporter and the iommufd DMABUF importer. This is intended to be a temporary API until the core DMABUF interface is improved to natively support a private interconnect and revocable negotiation. This function should only be called by iommufd when trying to map a DMABUF. For now iommufd will only support VFIO DMABUFs. The following improvements are needed in the DMABUF API to generically support more exporters with iommufd/kvm type importers that cannot use the DMA API: 1) Revoke semantics. VFIO needs to be able to prevent access to the MMIO during FLR, and so it will use dma_buf_move_notify() to prevent access. iommufd does not support fault handling so it cannot implement the full move_notify. Instead, if revoke is negotiated, the exporter promises not to use move_notify() unless the importer can experience failures. iommufd will unmap the dmabuf from the iommu page tables while it is revoked. 2) Private interconnect negotiation. iommufd will only be able to map a "private interconnect" that provides a phys_addr_t and a struct p2pdma_provider * to describe the memory. It cannot use a DMA mapped scatterlist since it is directly calling iommu_map(). 3) NULL device during dma_buf_dynamic_attach(). Since iommufd doesn't use the DMA API it doesn't have a DMAable struct device to pass here. Link: https://patch.msgid.link/r/1-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Acked-by: Alex Williamson Signed-off-by: Jason Gunthorpe [ Shuai Xue: Conflicts: include/linux/vfio_pci_core.h: minor, missing vfio_pci_core_iowrite replace EXPORT_SYMBOL_FOR_MODULES with EXPORT_SYMBOL_GPL_FOR_MODULES ] Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_dmabuf.c | 34 ++++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 5 +++++ 2 files changed, 39 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index d9a2a1acc73a..c0156c585b49 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -94,6 +94,40 @@ static const struct dma_buf_ops vfio_pci_dmabuf_ops = { .release = vfio_pci_dma_buf_release, }; +/* + * This is a temporary "private interconnect" between VFIO DMABUF and iommufd. + * It allows the two co-operating drivers to exchange the physical address of + * the BAR. This is to be replaced with a formal DMABUF system for negotiated + * interconnect types.
+ * + * If this function succeeds the following are true: + * - There is one physical range and it is pointing to MMIO + * - When move_notify is called it means revoke, not move, vfio_dma_buf_map + * will fail if it is currently revoked + */ +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct vfio_pci_dma_buf *priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops) + return -EOPNOTSUPP; + + priv = attachment->dmabuf->priv; + if (priv->revoked) + return -ENODEV; + + /* More than one range to iommufd will require proper DMABUF support */ + if (priv->nr_ranges != 1) + return -EOPNOTSUPP; + + *phys = priv->phys_vec[0]; + return 0; +} +EXPORT_SYMBOL_GPL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd"); + int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, struct vfio_region_dma_range *dma_ranges, size_t nr_ranges, phys_addr_t start, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index c6e9df4ecfd6..336ef71c40a5 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -29,6 +29,7 @@ struct vfio_pci_core_device_ser; struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_phys_vec; +struct dma_buf_attachment; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -180,4 +181,8 @@ bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t *buf_offset, size_t *intersect_count, size_t *register_offset); + +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); + #endif /* VFIO_PCI_CORE_H */ -- Gitee From 39bb11e51102bd58000a5890ae15180820ce350a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:50:59 -0400 Subject: [PATCH 18/43] iommufd: Add DMABUF to iopt_pages ANBZ: #31969 commit 71db84a092c399f434976d0522e848e9803cd51a upstream. Add IOPT_ADDRESS_DMABUF to the iopt_pages and the basic infrastructure to create an iopt_pages from a struct dma_buf *. DMABUF pages are not supported for accesses, and for now can only be used with the VFIO DMABUF exporter. The overall flow will be similar to memfd where the user can pass in a DMABUF file descriptor to IOMMU_IOAS_MAP_FILE and create an area and pages. Like other areas it can be copied and otherwise manipulated, though there is little point in doing so. There is no pinned page accounting done for DMABUF maps. The DMABUF attachment exists so long as the dmabuf is mapped into an IOAS, even if the IOAS is not mapped to any domains. 
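The intended userspace flow then looks roughly like this (a sketch; the struct follows the iommufd uapi, dmabuf_fd is assumed to come from VFIO's VFIO_DEVICE_FEATURE_DMA_BUF, and error handling is omitted):

	struct iommu_ioas_map_file map = {
		.size = sizeof(map),
		.flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE,
		.ioas_id = ioas_id,
		.fd = dmabuf_fd,
		.start = 0,		/* offset into the dmabuf, page aligned */
		.length = length,	/* page aligned */
	};

	if (!ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &map))
		/* without IOMMU_IOAS_MAP_FIXED_IOVA the kernel chooses map.iova */
		printf("mapped at iova 0x%llx\n", (unsigned long long)map.iova);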
Link: https://patch.msgid.link/r/2-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 3 + drivers/iommu/iommufd/io_pagetable.h | 25 ++++- drivers/iommu/iommufd/iommufd_private.h | 2 + drivers/iommu/iommufd/main.c | 10 ++ drivers/iommu/iommufd/pages.c | 137 ++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 75d60f2ad900..1a8fb8214549 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -284,6 +284,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, case IOPT_ADDRESS_FILE: start = elm->start_byte + elm->pages->start; break; + case IOPT_ADDRESS_DMABUF: + start = elm->start_byte + elm->pages->dmabuf.start; + break; } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index b6064f4ce4af..08b6bfb6b144 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -5,6 +5,7 @@ #ifndef __IO_PAGETABLE_H #define __IO_PAGETABLE_H +#include #include #include #include @@ -179,7 +180,15 @@ enum { enum iopt_address_type { IOPT_ADDRESS_USER = 0, - IOPT_ADDRESS_FILE = 1, + IOPT_ADDRESS_FILE, + IOPT_ADDRESS_DMABUF, +}; + +struct iopt_pages_dmabuf { + struct dma_buf_attachment *attach; + struct dma_buf_phys_vec phys; + /* Always PAGE_SIZE aligned */ + unsigned long start; }; /* @@ -209,6 +218,8 @@ struct iopt_pages { struct file *file; unsigned long start; }; + /* IOPT_ADDRESS_DMABUF */ + struct iopt_pages_dmabuf dmabuf; }; bool writable:1; u8 account_mode; @@ -220,10 +231,22 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; +static inline bool iopt_is_dmabuf(struct iopt_pages *pages) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return false; + return pages->type == IOPT_ADDRESS_DMABUF; +} + struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 85d0843ed07b..96a3264760c0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -504,6 +504,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); +struct device *iommufd_global_device(void); + struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index f0f4c7723abf..abeb41680e3f 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = { .mode = 0666, }; +/* + * Used only by DMABUF, returns a valid struct device to use as a dummy struct + * device for attachment. 
+ */ +struct device *iommufd_global_device(void) +{ + return iommu_misc_dev.this_device; +} + static int __init iommufd_init(void) { int ret; @@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS(IOMMUFD_INTERNAL); MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS(DMA_BUF); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 5968bed72214..4c5e3c30a5ab 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,8 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include +#include #include #include #include @@ -53,6 +55,7 @@ #include #include #include +#include #include "double_span.h" #include "io_pagetable.h" @@ -272,6 +275,7 @@ struct pfn_batch { unsigned int end; unsigned int total_pfns; }; +enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { @@ -350,7 +354,6 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, u32 nr) { - const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); unsigned int end = batch->end; if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && @@ -1454,6 +1457,121 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, return pages; } +static void iopt_revoke_notify(struct dma_buf_attachment *attach) +{ + struct iopt_pages *pages = attach->importer_priv; + + guard(mutex)(&pages->mutex); + pages->dmabuf.phys.len = 0; +} + +static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { + .allow_peer2peer = true, + .move_notify = iopt_revoke_notify, +}; + +/* + * iommufd and vfio have a circular dependency. Future work for a phys + * based private interconnect will remove this. + */ +static int +sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + typeof(&vfio_pci_dma_buf_iommufd_map) fn; + int rc; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) + return -EOPNOTSUPP; + + fn = symbol_get(vfio_pci_dma_buf_iommufd_map); + if (!fn) + return -EOPNOTSUPP; + rc = fn(attachment, phys); + symbol_put(vfio_pci_dma_buf_iommufd_map); + return rc; +} + +static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, + struct dma_buf *dmabuf) +{ + struct dma_buf_attachment *attach; + int rc; + + attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), + &iopt_dmabuf_attach_revoke_ops, pages); + if (IS_ERR(attach)) + return PTR_ERR(attach); + + dma_resv_lock(dmabuf->resv, NULL); + /* + * Lock ordering requires the mutex to be taken inside the reservation, + * make sure lockdep sees this. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + mutex_lock(&pages->mutex); + mutex_unlock(&pages->mutex); + } + + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_detach; + + dma_resv_unlock(dmabuf->resv); + + /* On success iopt_release_pages() will detach and put the dmabuf. 
*/ + pages->dmabuf.attach = attach; + return 0; + +err_detach: + dma_resv_unlock(dmabuf->resv); + dma_buf_detach(dmabuf, attach); + return rc; +} + +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) +{ + static struct lock_class_key pages_dmabuf_mutex_key; + struct iopt_pages *pages; + int rc; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return ERR_PTR(-EOPNOTSUPP); + + if (dmabuf->size <= (start + length - 1) || + length / PAGE_SIZE >= MAX_NPFNS) + return ERR_PTR(-EINVAL); + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + + /* + * The mmap_lock can be held when obtaining the dmabuf reservation lock + * which creates a locking cycle with the pages mutex which is held + * while obtaining the mmap_lock. This locking path is not present for + * IOPT_ADDRESS_DMABUF so split the lock class. + */ + lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); + + /* dmabuf does not use pinned page accounting. */ + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + pages->type = IOPT_ADDRESS_DMABUF; + pages->dmabuf.start = start - start_byte; + + rc = iopt_map_dmabuf(ictx, pages, dmabuf); + if (rc) { + iopt_put_pages(pages); + return ERR_PTR(rc); + } + + return pages; +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1466,8 +1584,14 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { + struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + + dma_buf_detach(dmabuf, pages->dmabuf.attach); + dma_buf_put(dmabuf); + } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); + } kfree(pages); } @@ -2125,15 +2249,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages)) + return -EINVAL; + + if (pages->type != IOPT_ADDRESS_USER) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); - if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(pages->type != IOPT_ADDRESS_USER)) - return -EINVAL; - if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, -- Gitee From 853f689d546250e0a485e688909dcf77c735fbe3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:00 -0400 Subject: [PATCH 19/43] iommufd: Do not map/unmap revoked DMABUFs ANBZ: #31969 commit 71e2409a0c85c3875ca0dec2515ccceb21a4785b upstream. Once a DMABUF is revoked the domain will be unmapped under the pages mutex. Double unmapping will trigger a WARN, and mapping while revoked will fail. Check for revoked DMABUFs along all the map and unmap paths to resolve this. Ensure that map/unmap is always done under the pages mutex so it is synchronized with the revoke notifier. If a revoke happens between allocating the iopt_pages and the population to a domain then the population will succeed, and leave things unmapped as though revoke had happened immediately after. Currently there is no way to repopulate the domains. 
Userspace is expected to know if it is going to do something that would trigger revoke (eg if it is about to do an FLR), and it should remove the DMABUF mappings before and put them back after. The revoke is only to protect the kernel from misbehaving userspace. Link: https://patch.msgid.link/r/3-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 11 +++++- drivers/iommu/iommufd/io_pagetable.h | 8 +++++ drivers/iommu/iommufd/pages.c | 54 +++++++++++++++++----------- 3 files changed, 52 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 1a8fb8214549..0fe0f9c00fc4 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -964,9 +964,14 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; + if (iopt_is_dmabuf(pages)) { + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain(area, domain); + } mutex_unlock(&pages->mutex); - iopt_area_unmap_domain(area, domain); + if (!iopt_is_dmabuf(pages)) + iopt_area_unmap_domain(area, domain); } return; } @@ -1255,6 +1260,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (!pages || area->prevent_access) return -EBUSY; + /* Maintaining the domains_itree below is a bit complicated */ + if (iopt_is_dmabuf(pages)) + return -EOPNOTSUPP; + if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 08b6bfb6b144..892daf4b1f1e 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -238,6 +238,14 @@ static inline bool iopt_is_dmabuf(struct iopt_pages *pages) return pages->type == IOPT_ADDRESS_DMABUF; } +static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) +{ + lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) + return pages->dmabuf.phys.len == 0; + return false; +} + struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 4c5e3c30a5ab..4ac2b5a9a281 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1744,6 +1744,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { + if (iopt_dmabuf_revoked(pages)) + return; + __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } @@ -1764,6 +1767,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) lockdep_assert_held(&area->pages->mutex); + if (iopt_dmabuf_revoked(area->pages)) + return 0; + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) @@ -1823,33 +1829,38 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); - rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), - iopt_area_last_index(area)); - if (rc) - goto out_unlock; +
if (!iopt_dmabuf_revoked(pages)) { + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; - while (!pfn_reader_done(&pfns)) { - done_first_end_index = pfns.batch_end_index; - done_all_end_index = pfns.batch_start_index; - xa_for_each(&area->iopt->domains, index, domain) { - rc = batch_to_domain(&pfns.batch, domain, area, - pfns.batch_start_index); + rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } - done_all_end_index = done_first_end_index; - - rc = pfn_reader_next(&pfns); + rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; + + pfn_reader_destroy(&pfns); } - rc = pfn_reader_update_pinned(&pfns); - if (rc) - goto out_unmap; area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); - goto out_destroy; + mutex_unlock(&pages->mutex); + return 0; out_unmap: pfn_reader_release_pins(&pfns); @@ -1876,7 +1887,6 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) end_index); } } -out_destroy: pfn_reader_destroy(&pfns); out_unlock: mutex_unlock(&pages->mutex); @@ -1903,11 +1913,15 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) if (!area->storage_domain) goto out_unlock; - xa_for_each(&iopt->domains, index, domain) - if (domain != area->storage_domain) + xa_for_each(&iopt->domains, index, domain) { + if (domain == area->storage_domain) + continue; + + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); + } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); -- Gitee From 6de6d7ce016fd31d960996201ce205fb0926432d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:01 -0400 Subject: [PATCH 20/43] iommufd: Allow a DMABUF to be revoked ANBZ: #31969 commit fc7063abd98e8477c47b792324785ef60c071b9e upstream. When connected to VFIO, the only DMABUF exporter that is accepted, the move_notify callback will be made when VFIO wants to remove access to the MMIO. This is called revoke. Wire up revoke to go through all the iommu_domain's that have mapped the DMABUF and unmap them. The locking here is unpleasant: since the existing locking scheme was designed to come from the iopt through the area to the pages, we cannot use pages as the starting point for the locking. There is no way to obtain the domains_rwsem before obtaining the pages mutex to reliably use the existing domains_itree. Solve this problem by adding a new tracking structure just for DMABUF revoke. Record a linked list of areas and domains inside the pages mutex. Clean the entries on the list during revoke. The map/unmaps are now all done under a pages mutex while updating the tracking linked list so nothing can get out of sync. Only one lock is required for revoke processing.
Link: https://patch.msgid.link/r/4-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 16 +++- drivers/iommu/iommufd/io_pagetable.h | 17 +++++ drivers/iommu/iommufd/pages.c | 106 ++++++++++++++++++++++++++- 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 0fe0f9c00fc4..ee4e042ac6fd 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -967,6 +967,7 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, if (iopt_is_dmabuf(pages)) { if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain(area, domain); + iopt_dmabuf_untrack_domain(pages, area, domain); } mutex_unlock(&pages->mutex); @@ -988,6 +989,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } } @@ -1017,10 +1020,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt, if (!pages) continue; - mutex_lock(&pages->mutex); + guard(mutex)(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto out_unfill; + } rc = iopt_area_fill_domain(area, domain); if (rc) { - mutex_unlock(&pages->mutex); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); goto out_unfill; } if (!area->storage_domain) { @@ -1029,7 +1038,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt, interval_tree_insert(&area->pages_node, &pages->domains_itree); } - mutex_unlock(&pages->mutex); } return 0; @@ -1050,6 +1058,8 @@ static int iopt_fill_domain(struct io_pagetable *iopt, area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } return rc; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 892daf4b1f1e..8f8d583e0243 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -70,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain); +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain); +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain); +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages); +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -184,11 +194,18 @@ enum iopt_address_type { IOPT_ADDRESS_DMABUF, }; +struct iopt_pages_dmabuf_track { + struct iommu_domain *domain; + struct iopt_area *area; + struct list_head elm; +}; + struct iopt_pages_dmabuf { struct dma_buf_attachment *attach; struct dma_buf_phys_vec phys; /* Always PAGE_SIZE aligned */ unsigned long start; + struct list_head tracker; }; /* diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 
4ac2b5a9a281..72865d32e803 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1460,8 +1460,19 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, static void iopt_revoke_notify(struct dma_buf_attachment *attach) { struct iopt_pages *pages = attach->importer_priv; + struct iopt_pages_dmabuf_track *track; guard(mutex)(&pages->mutex); + if (iopt_dmabuf_revoked(pages)) + return; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + struct iopt_area *area = track->area; + + iopt_area_unmap_domain_range(area, track->domain, + iopt_area_index(area), + iopt_area_last_index(area)); + } pages->dmabuf.phys.len = 0; } @@ -1562,6 +1573,7 @@ struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; pages->type = IOPT_ADDRESS_DMABUF; pages->dmabuf.start = start - start_byte; + INIT_LIST_HEAD(&pages->dmabuf.tracker); rc = iopt_map_dmabuf(ictx, pages, dmabuf); if (rc) { @@ -1572,6 +1584,86 @@ struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, return pages; } +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + if (WARN_ON(!iopt_is_dmabuf(pages))) + return -EINVAL; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->domain == domain && track->area == area)) + return -EINVAL; + + track = kzalloc(sizeof(*track), GFP_KERNEL); + if (!track) + return -ENOMEM; + track->domain = domain; + track->area = area; + list_add_tail(&track->elm, &pages->dmabuf.tracker); + + return 0; +} + +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + WARN_ON(!iopt_is_dmabuf(pages)); + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + if (track->domain == domain && track->area == area) { + list_del(&track->elm); + kfree(track); + return; + } + } + WARN_ON(true); +} + +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iommu_domain *domain; + unsigned long index; + int rc; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->area == area)) + return -EINVAL; + + xa_for_each(&area->iopt->domains, index, domain) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto err_untrack; + } + return 0; +err_untrack: + iopt_dmabuf_untrack_all_domains(area, pages); + return rc; +} + +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iopt_pages_dmabuf_track *tmp; + + list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, + elm) { + if (track->area == area) { + list_del(&track->elm); + kfree(track); + } + } +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1589,6 +1681,7 @@ void iopt_release_pages(struct kref *kref) dma_buf_detach(dmabuf, pages->dmabuf.attach); dma_buf_put(dmabuf); + WARN_ON(!list_empty(&pages->dmabuf.tracker)); } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); } @@ -1829,11 +1922,17 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = 
iopt_dmabuf_track_all_domains(area, pages); + if (rc) + goto out_unlock; + } + if (!iopt_dmabuf_revoked(pages)) { rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) - goto out_unlock; + goto out_untrack; while (!pfn_reader_done(&pfns)) { done_first_end_index = pfns.batch_end_index; @@ -1888,6 +1987,9 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) } } pfn_reader_destroy(&pfns); +out_untrack: + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; @@ -1927,6 +2029,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); -- Gitee From 568ceb4c4757704391a18432e25e663b0531ff12 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:02 -0400 Subject: [PATCH 21/43] iommufd: Allow MMIO pages in a batch ANBZ: #31969 commit 3114c674401e81b6b56da2f9f0762eb99ae8ceba upstream. Addresses intended for MMIO should be propagated through to the iommu with the IOMMU_MMIO flag set. Keep track in the batch of whether all the pfns are cachable or MMIO, and flush the batch if the kind ever needs to change. Switch to IOMMU_MMIO if the batch is MMIO when mapping the iommu. Link: https://patch.msgid.link/r/5-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/pages.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 72865d32e803..d7df69b7f7e6 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -261,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, return container_of(node, struct iopt_area, pages_node); } +enum batch_kind { + BATCH_CPU_MEMORY = 0, + BATCH_MMIO, +}; + /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs.
This is used as a temporary holding memory for shuttling pfns from one @@ -274,6 +279,7 @@ struct pfn_batch { unsigned int array_size; unsigned int end; unsigned int total_pfns; + enum batch_kind kind; }; enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; @@ -352,10 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) } static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, - u32 nr) + u32 nr, enum batch_kind kind) { unsigned int end = batch->end; + if (batch->kind != kind) { + /* One kind per batch */ + if (batch->end != 0) + return false; + batch->kind = kind; + } + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && nr <= MAX_NPFNS - batch->npfns[end - 1]) { batch->npfns[end - 1] += nr; @@ -382,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { - return batch_add_pfn_num(batch, pfn, 1); + return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); } /* @@ -495,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); + int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; @@ -502,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, unsigned long iova; int rc; + if (batch->kind == BATCH_MMIO) { + iommu_prot &= ~IOMMU_CACHE; + iommu_prot |= IOMMU_MMIO; + } + /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; @@ -515,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot); + next_iova - iova, iommu_prot); else rc = iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot, + next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; @@ -655,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, nr = min(nr, npages); npages -= nr; - if (!batch_add_pfn_num(batch, pfn, nr)) + if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; if (nr > 1) { rc = folio_add_pins(folio, nr - 1); -- Gitee From d1c6a0918442603c468af9bc000571df12634224 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:03 -0400 Subject: [PATCH 22/43] iommufd: Have pfn_reader process DMABUF iopt_pages ANBZ: #31969 commit 74014a4b55f5df935977368efe09d650e22495f0 upstream. Make another sub implementation of pfn_reader for DMABUF. This version will fill the batch using the struct phys_vec recorded during the attachment. 
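Condensed from the diff below, the DMABUF fill path is a single arithmetic step, since the attachment already recorded one physically contiguous MMIO range in a struct dma_buf_phys_vec:

static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf,
				  struct pfn_batch *batch,
				  unsigned long start_index,
				  unsigned long last_index)
{
	unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE;

	/* One contiguous range; BATCH_MMIO makes it map with IOMMU_MMIO */
	batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start),
			  last_index - start_index + 1, BATCH_MMIO);
	return 0;
}

There is nothing to pin and nothing to unpin here, which is why the pin-release and user-destroy steps of pfn_reader are skipped for DMABUF pages in the hunks below.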
Link: https://patch.msgid.link/r/6-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/pages.c | 73 +++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index d7df69b7f7e6..c021ffbe4b70 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1169,6 +1169,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, return iopt_pages_update_pinned(pages, npages, inc, user); } +struct pfn_reader_dmabuf { + struct dma_buf_phys_vec phys; + unsigned long start_offset; +}; + +static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, + struct iopt_pages *pages) +{ + /* Callers must not get here if the dmabuf was already revoked */ + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return -EINVAL; + + dmabuf->phys = pages->dmabuf.phys; + dmabuf->start_offset = pages->dmabuf.start; + return 0; +} + +static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, + struct pfn_batch *batch, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; + + /* + * start/last_index and start are all PAGE_SIZE aligned, the batch is + * always filled using page size aligned PFNs just like the other types. + * If the dmabuf has been sliced on a sub page offset then the common + * batch to domain code will adjust it before mapping to the domain. + */ + batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), + last_index - start_index + 1, BATCH_MMIO); + return 0; +} + /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. 
This is only populated if there is a @@ -1187,7 +1222,10 @@ struct pfn_reader { unsigned long batch_end_index; unsigned long last_index; - struct pfn_reader_user user; + union { + struct pfn_reader_user user; + struct pfn_reader_dmabuf dmabuf; + }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) @@ -1223,7 +1261,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; unsigned long npages; struct iopt_area *area; int rc; @@ -1255,8 +1293,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return 0; } - if (start_index >= pfns->user.upages_end) { - rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + if (iopt_is_dmabuf(pfns->pages)) + return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, + start_index, span->last_hole); + + user = &pfns->user; + if (start_index >= user->upages_end) { + rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; @@ -1324,7 +1367,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; - pfn_reader_user_init(&pfns->user, pages); + if (iopt_is_dmabuf(pages)) + pfn_reader_dmabuf_init(&pfns->dmabuf, pages); + else + pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; @@ -1345,8 +1391,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; + + if (iopt_is_dmabuf(pages)) + return; + user = &pfns->user; if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ @@ -1377,7 +1427,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns) struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); - pfn_reader_user_destroy(&pfns->user, pfns->pages); + if (!iopt_is_dmabuf(pfns->pages)) + pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } @@ -1781,6 +1832,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area, lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return; + iopt_area_unmap_domain_range(area, domain, start_index, + last_index); + return; + } + /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. -- Gitee From c6da8711987aa3864d2b6f606e61eba3c03c3d06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:04 -0400 Subject: [PATCH 23/43] iommufd: Have iopt_map_file_pages convert the fd to a file ANBZ: #31969 commit 217725f0b2c31d8328fe92b35ae4b38c102a846f upstream. Since dmabuf only has APIs that work on an int fd and not a struct file *, pass the fd deeper into the call chain so we can use the dmabuf APIs as is. 
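After this change the fget()/fput() pair lives inside iopt_map_file_pages() itself; the resulting function is short (reproduced from the diff below):

int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, int fd, unsigned long start,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;
	struct file *file;

	file = fget(fd);
	if (!file)
		return -EBADF;

	pages = iopt_alloc_file_pages(file, start, length,
				      iommu_prot & IOMMU_WRITE);
	/* iopt_alloc_file_pages() holds its own reference via get_file() */
	fput(file);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start - pages->start, iommu_prot, flags);
}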
Link: https://patch.msgid.link/r/7-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 15 +++++++++++---- drivers/iommu/iommufd/ioas.c | 8 +------- drivers/iommu/iommufd/iommufd_private.h | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index ee4e042ac6fd..eaa24b9f2990 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -471,21 +472,27 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. Otherwise is the iova to map to on input - * @file: file to map + * @fd: fdno of a file to map * @start: map file starting at this byte offset * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero */ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, - unsigned long start, unsigned long length, - int iommu_prot, unsigned int flags) + unsigned long *iova, int fd, unsigned long start, + unsigned long length, int iommu_prot, + unsigned int flags) { struct iopt_pages *pages; + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; pages = iopt_alloc_file_pages(file, start, length, iommu_prot & IOMMU_WRITE); + fput(file); if (IS_ERR(pages)) return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 459a7c516915..f4721afedadc 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) unsigned long iova = cmd->iova; struct iommufd_ioas *ioas; unsigned int flags = 0; - struct file *file; int rc; if (cmd->flags & @@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) flags = IOPT_ALLOC_IOVA; - file = fget(cmd->fd); - if (!file) - return -EBADF; - - rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd, cmd->start, cmd->length, conv_iommu_prot(cmd->flags), flags); if (rc) @@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: iommufd_put_object(ucmd->ictx, &ioas->obj); - fput(file); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 96a3264760c0..98661aee760b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -108,7 +108,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, + unsigned long *iova, int fd, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, -- Gitee From 
1ca47773b318786b80c9f550807ac6a05d3cd5bc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:05 -0400 Subject: [PATCH 24/43] iommufd: Accept a DMABUF through IOMMU_IOAS_MAP_FILE ANBZ: #31969 commit 44ebaa1744fd79cd86d10f5453c90b8c4e22b7f4 upstream. Finally call iopt_alloc_dmabuf_pages() if the user passed in a DMABUF through IOMMU_IOAS_MAP_FILE. This makes the feature visible to userspace. Link: https://patch.msgid.link/r/8-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 43 +++++++++++++++++++++------- drivers/iommu/iommufd/io_pagetable.h | 4 ++- drivers/iommu/iommufd/pages.c | 13 ++++----- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index eaa24b9f2990..54cf4d856179 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,6 +8,7 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ +#include #include #include #include @@ -484,19 +485,41 @@ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned int flags) { struct iopt_pages *pages; - struct file *file; + struct dma_buf *dmabuf; + unsigned long start_byte; + unsigned long last; - file = fget(fd); - if (!file) - return -EBADF; + if (!length) + return -EINVAL; + if (check_add_overflow(start, length - 1, &last)) + return -EOVERFLOW; + + start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); + dmabuf = dma_buf_get(fd); + if (!IS_ERR(dmabuf)) { + pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, + length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) { + dma_buf_put(dmabuf); + return PTR_ERR(pages); + } + } else { + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + pages = iopt_alloc_file_pages(file, start_byte, start, length, + iommu_prot & IOMMU_WRITE); + fput(file); + if (IS_ERR(pages)) + return PTR_ERR(pages); + } - pages = iopt_alloc_file_pages(file, start, length, - iommu_prot & IOMMU_WRITE); - fput(file); - if (IS_ERR(pages)) - return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, - start - pages->start, iommu_prot, flags); + start_byte, iommu_prot, flags); } struct iova_bitmap_fn_arg { diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 8f8d583e0243..14cd052fd320 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -265,7 +265,9 @@ static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable); struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, struct dma_buf *dmabuf, diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index c021ffbe4b70..a5e184da9ae8 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1507,22 +1507,19 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, return pages; } -struct iopt_pages *iopt_alloc_file_pages(struct file 
*file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable) { struct iopt_pages *pages; - unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); - unsigned long end; - - if (length && check_add_overflow(start, length - 1, &end)) - return ERR_PTR(-EOVERFLOW); - pages = iopt_alloc_pages(start - start_down, length, writable); + pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; pages->file = get_file(file); - pages->start = start_down; + pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; return pages; } -- Gitee From d5ef88b5d043ce4d1bcb75716ba732ea485727df Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Dec 2025 11:03:29 +0100 Subject: [PATCH 25/43] iommufd: Fix building without dmabuf ANBZ: #31969 commit 69dc538a4f5a57dcc5ea4893c769d567f539a1b1 upstream. When DMABUF is disabled, trying to use it causes a link failure: x86_64-linux-ld: drivers/iommu/iommufd/io_pagetable.o: in function `iopt_map_file_pages': io_pagetable.c:(.text+0x1735): undefined reference to `dma_buf_get' x86_64-linux-ld: io_pagetable.c:(.text+0x1775): undefined reference to `dma_buf_put' Fixes: 44ebaa1744fd ("iommufd: Accept a DMABUF through IOMMU_IOAS_MAP_FILE") Link: https://patch.msgid.link/r/20251204100333.1034767-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/io_pagetable.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 54cf4d856179..436992331111 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -495,7 +495,11 @@ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return -EOVERFLOW; start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); - dmabuf = dma_buf_get(fd); + if (IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + dmabuf = dma_buf_get(fd); + else + dmabuf = ERR_PTR(-ENXIO); + if (!IS_ERR(dmabuf)) { pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, length, -- Gitee From 1341d265745168106dec672e1f0c4141820bf4ea Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:51:06 -0400 Subject: [PATCH 26/43] iommufd/selftest: Add some tests for the dmabuf flow ANBZ: #31969 commit d2041f1f11dd99076010841a86c4d02d04650814 upstream. Basic tests of establishing a dmabuf and revoking it. The selftest kernel side provides a basic small dmabuf for this testing. 
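At the uAPI level the revoke test added below reduces to the following sequence (an annotated excerpt; the helper macros are defined in iommufd_utils.h further down):

test_cmd_get_dmabuf(buf_size, &dfd);		/* export a selftest dmabuf */
test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); /* map it into the IOAS */
test_cmd_revoke_dmabuf(dfd, true);		/* exporter revokes the MMIO */
/* while revoked, creating a new mapping fails */
test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2);
test_cmd_revoke_dmabuf(dfd, false);		/* un-revoke */
test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2); /* mapping works again */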
Link: https://patch.msgid.link/r/9-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/iommufd_private.h | 10 ++ drivers/iommu/iommufd/iommufd_test.h | 10 ++ drivers/iommu/iommufd/pages.c | 4 + drivers/iommu/iommufd/selftest.c | 143 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 43 ++++++ tools/testing/selftests/iommu/iommufd_utils.h | 44 ++++++ 6 files changed, 254 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 98661aee760b..eb6d1a70f673 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,8 @@ struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct dma_buf_attachment; +struct dma_buf_phys_vec; struct iommufd_sw_msi_map { struct list_head sw_msi_item; @@ -715,6 +717,8 @@ bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, @@ -736,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } +static inline int +iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf9..9166c39eb0c8 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -29,6 +29,8 @@ enum { IOMMU_TEST_OP_PASID_REPLACE, IOMMU_TEST_OP_PASID_DETACH, IOMMU_TEST_OP_PASID_CHECK_HWPT, + IOMMU_TEST_OP_DMABUF_GET, + IOMMU_TEST_OP_DMABUF_REVOKE, }; enum { @@ -176,6 +178,14 @@ struct iommu_test_cmd { __u32 hwpt_id; /* @id is stdev_id */ } pasid_check; + struct { + __u32 length; + __u32 open_flags; + } dmabuf_get; + struct { + __s32 dmabuf_fd; + __u32 revoked; + } dmabuf_revoke; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index a5e184da9ae8..68da809ad07c 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1559,6 +1559,10 @@ sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, typeof(&vfio_pci_dma_buf_iommufd_map) fn; int rc; + rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); + if (rc != -EOPNOTSUPP) + return rc; + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) return -EOPNOTSUPP; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 3e64335770d6..7f886810690c 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #include #include #include @@ -2037,6 +2039,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj) } } +struct iommufd_test_dma_buf { + void *memory; + size_t length; + bool revoked; +}; + +static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + return 0; +} + +static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ +} + +static struct sg_table * +iommufd_test_dma_buf_map(struct 
dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ +} + +static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf) +{ + struct iommufd_test_dma_buf *priv = dmabuf->priv; + + kfree(priv->memory); + kfree(priv); +} + +static const struct dma_buf_ops iommufd_test_dmabuf_ops = { + .attach = iommufd_test_dma_buf_attach, + .detach = iommufd_test_dma_buf_detach, + .map_dma_buf = iommufd_test_dma_buf_map, + .release = iommufd_test_dma_buf_release, + .unmap_dma_buf = iommufd_test_dma_buf_unmap, +}; + +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + phys->paddr = virt_to_phys(priv->memory); + phys->len = priv->length; + return 0; +} + +static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd, + unsigned int open_flags, + size_t len) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc; + + len = ALIGN(len, PAGE_SIZE); + if (len == 0 || len > PAGE_SIZE * 512) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->length = len; + priv->memory = kzalloc(len, GFP_KERNEL); + if (!priv->memory) { + rc = -ENOMEM; + goto err_free; + } + + exp_info.ops = &iommufd_test_dmabuf_ops; + exp_info.size = len; + exp_info.flags = open_flags; + exp_info.priv = priv; + + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + rc = PTR_ERR(dmabuf); + goto err_free; + } + + return dma_buf_fd(dmabuf, open_flags); + +err_free: + kfree(priv->memory); + kfree(priv); + return rc; +} + +static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, + bool revoked) +{ + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc = 0; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + if (dmabuf->ops != &iommufd_test_dmabuf_ops) { + rc = -EOPNOTSUPP; + goto err_put; + } + + priv = dmabuf->priv; + dma_resv_lock(dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(dmabuf); + dma_resv_unlock(dmabuf->resv); + +err_put: + dma_buf_put(dmabuf); + return rc; +} + int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; @@ -2115,6 +2251,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_pasid_detach(ucmd, cmd); case IOMMU_TEST_OP_PASID_CHECK_HWPT: return iommufd_test_pasid_check_hwpt(ucmd, cmd); + case IOMMU_TEST_OP_DMABUF_GET: + return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags, + cmd->dmabuf_get.length); + case IOMMU_TEST_OP_DMABUF_REVOKE: + return iommufd_test_dmabuf_revoke(ucmd, + cmd->dmabuf_revoke.dmabuf_fd, + cmd->dmabuf_revoke.revoked); default: return -EOPNOTSUPP; } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 1f52140d9d9e..b206b4cacd9b 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1572,6 +1572,49 @@ TEST_F(iommufd_ioas, copy_sweep) test_ioctl_destroy(dst_ioas_id); } +TEST_F(iommufd_ioas, dmabuf_simple) +{ + size_t buf_size = PAGE_SIZE*4; + __u64 iova; + int dfd; + + 
test_cmd_get_dmabuf(buf_size, &dfd); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, 0, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, buf_size, buf_size, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, buf_size + 1, &iova); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + + close(dfd); +} + +TEST_F(iommufd_ioas, dmabuf_revoke) +{ + size_t buf_size = PAGE_SIZE*4; + __u32 hwpt_id; + __u64 iova; + __u64 iova2; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + test_cmd_revoke_dmabuf(dfd, true); + + if (variant->mock_domains) + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &hwpt_id); + + test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2); + + test_cmd_revoke_dmabuf(dfd, false); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2); + + /* Restore the iova back */ + test_ioctl_ioas_unmap(iova, buf_size); + test_ioctl_ioas_map_fixed_file(dfd, 0, buf_size, iova); + + close(dfd); +} + FIXTURE(iommufd_mock_domain) { int fd; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9f472c20c190..55c285dbbf52 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -548,6 +548,39 @@ static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id, EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \ self->fd, access_id, access_pages_id)) +static int _test_cmd_get_dmabuf(int fd, size_t len, int *out_fd) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_GET, + .dmabuf_get = { .length = len, .open_flags = O_CLOEXEC }, + }; + + *out_fd = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (*out_fd < 0) + return -1; + return 0; +} +#define test_cmd_get_dmabuf(len, out_fd) \ + ASSERT_EQ(0, _test_cmd_get_dmabuf(self->fd, len, out_fd)) + +static int _test_cmd_revoke_dmabuf(int fd, int dmabuf_fd, bool revoked) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_REVOKE, + .dmabuf_revoke = { .dmabuf_fd = dmabuf_fd, .revoked = revoked }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret < 0) + return -1; + return 0; +} +#define test_cmd_revoke_dmabuf(dmabuf_fd, revoke) \ + ASSERT_EQ(0, _test_cmd_revoke_dmabuf(self->fd, dmabuf_fd, revoke)) + static int _test_ioctl_destroy(int fd, unsigned int id) { struct iommu_destroy cmd = { @@ -718,6 +751,17 @@ static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd, self->fd, ioas_id, mfd, start, length, iova_p, \ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) +#define test_ioctl_ioas_map_fixed_file(mfd, start, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, \ + length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) { struct iommu_test_cmd memlimit_cmd = { -- Gitee From 380623b3fad9be89fb64590813844af6621cfbb5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Jan 2026 15:22:11 -0400 Subject: [PATCH 27/43] iommufd/selftest: Add missing kconfig for DMA_SHARED_BUFFER ANBZ: #31969 commit faa37ff3bf18d5242fe3d54f5462b1c3254c2567 upstream. The test doesn't build without it; dma-buf.h does not provide stub functions if it is not enabled.
Compilation can fail with: ERROR:root:ld: vmlinux.o: in function `iommufd_test': (.text+0x3b1cdd): undefined reference to `dma_buf_get' ld: (.text+0x3b1d08): undefined reference to `dma_buf_put' ld: (.text+0x3b2105): undefined reference to `dma_buf_export' ld: (.text+0x3b211f): undefined reference to `dma_buf_fd' ld: (.text+0x3b2e47): undefined reference to `dma_buf_move_notify' Add the missing select. Fixes: d2041f1f11dd ("iommufd/selftest: Add some tests for the dmabuf flow") Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 2beeb4f60ee5..e38927818410 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -41,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + select DMA_SHARED_BUFFER select IOMMUFD_DRIVER default n help -- Gitee From c144d1e600bb16e5c8ecc734828bb545125fcfb2 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Mon, 26 Jan 2026 17:24:46 +0800 Subject: [PATCH 28/43] anolis: config: enable CONFIG_{VFIO}_PCI_P2PDMA on arm64 and x86 ANBZ: #31969 Set CONFIG_{VFIO}_PCI_P2PDMA to default y on arm64 and x86. CONFIG_PCI_P2PDMA depends on ZONE_DEVICE, which is not enabled on loongarch and riscv. Therefore, keep CONFIG_PCI_P2PDMA disabled on loongarch and riscv. Signed-off-by: Shuai Xue Signed-off-by: Ruidong Tian --- anolis/configs/L1-RECOMMEND/default/CONFIG_PCI_P2PDMA | 1 + anolis/configs/L1-RECOMMEND/default/CONFIG_VFIO_PCI_DMABUF | 1 + .../default => L1-RECOMMEND/loongarch}/CONFIG_PCI_P2PDMA | 0 anolis/configs/L1-RECOMMEND/loongarch/CONFIG_VFIO_PCI_DMABUF | 1 + .../loongarch => L1-RECOMMEND/riscv}/CONFIG_PCI_P2PDMA | 0 anolis/configs/L1-RECOMMEND/riscv/CONFIG_VFIO_PCI_DMABUF | 1 + anolis/configs/L2-OPTIONAL/riscv/CONFIG_PCI_P2PDMA | 1 - 7 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_PCI_P2PDMA create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_VFIO_PCI_DMABUF rename anolis/configs/{L2-OPTIONAL/default => L1-RECOMMEND/loongarch}/CONFIG_PCI_P2PDMA (100%) create mode 100644 anolis/configs/L1-RECOMMEND/loongarch/CONFIG_VFIO_PCI_DMABUF rename anolis/configs/{L2-OPTIONAL/loongarch => L1-RECOMMEND/riscv}/CONFIG_PCI_P2PDMA (100%) create mode 100644 anolis/configs/L1-RECOMMEND/riscv/CONFIG_VFIO_PCI_DMABUF delete mode 100644 anolis/configs/L2-OPTIONAL/riscv/CONFIG_PCI_P2PDMA diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_PCI_P2PDMA b/anolis/configs/L1-RECOMMEND/default/CONFIG_PCI_P2PDMA new file mode 100644 index 000000000000..be83d03b36e3 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_PCI_P2PDMA @@ -0,0 +1 @@ +CONFIG_PCI_P2PDMA=y diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_VFIO_PCI_DMABUF b/anolis/configs/L1-RECOMMEND/default/CONFIG_VFIO_PCI_DMABUF new file mode 100644 index 000000000000..8dc9bd2ef222 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_VFIO_PCI_DMABUF @@ -0,0 +1 @@ +CONFIG_VFIO_PCI_DMABUF=y diff --git a/anolis/configs/L2-OPTIONAL/default/CONFIG_PCI_P2PDMA b/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_PCI_P2PDMA similarity index 100% rename from anolis/configs/L2-OPTIONAL/default/CONFIG_PCI_P2PDMA rename to anolis/configs/L1-RECOMMEND/loongarch/CONFIG_PCI_P2PDMA diff --git a/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_VFIO_PCI_DMABUF 
b/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_VFIO_PCI_DMABUF new file mode 100644 index 000000000000..33a773afc5d0 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_VFIO_PCI_DMABUF @@ -0,0 +1 @@ +# CONFIG_VFIO_PCI_DMABUF is not set diff --git a/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PCI_P2PDMA b/anolis/configs/L1-RECOMMEND/riscv/CONFIG_PCI_P2PDMA similarity index 100% rename from anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PCI_P2PDMA rename to anolis/configs/L1-RECOMMEND/riscv/CONFIG_PCI_P2PDMA diff --git a/anolis/configs/L1-RECOMMEND/riscv/CONFIG_VFIO_PCI_DMABUF b/anolis/configs/L1-RECOMMEND/riscv/CONFIG_VFIO_PCI_DMABUF new file mode 100644 index 000000000000..33a773afc5d0 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/riscv/CONFIG_VFIO_PCI_DMABUF @@ -0,0 +1 @@ +# CONFIG_VFIO_PCI_DMABUF is not set diff --git a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PCI_P2PDMA b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PCI_P2PDMA deleted file mode 100644 index 8898dbd7961d..000000000000 --- a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PCI_P2PDMA +++ /dev/null @@ -1 +0,0 @@ -# CONFIG_PCI_P2PDMA is not set -- Gitee From 5bc0b41c96bebc46a3b23def3c825c095115adcb Mon Sep 17 00:00:00 2001 From: Ted Logan Date: Thu, 19 Mar 2026 15:58:47 -0700 Subject: [PATCH 29/43] vfio: selftests: Build tests on aarch64 ANBZ: #31969 commit 1347a742a1e1b080e2e8d200312ae45b8d6ac859 upstream. Fix vfio selftests on aarch64, allowing native builds on aarch64 hosts. Reported-by: Matt Evans Closes: https://lore.kernel.org/all/e51b4ff2-13c4-47d4-b781-3dcbd740d274@meta.com/ Fixes: a55d4bbbe644 ("vfio: selftests: only build tests on arm64 and x86_64") Signed-off-by: Ted Logan Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20260319-vfio-selftests-aarch64-v2-1-bb2621c24dc4@fb.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- tools/testing/selftests/vfio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 792c4245d4f7..17082a57aa9f 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,6 +1,6 @@ ARCH ?= $(shell uname -m) -ifeq (,$(filter $(ARCH),arm64 x86_64)) +ifeq (,$(filter $(ARCH),aarch64 arm64 x86_64)) # Do nothing on unsupported architectures include ../lib.mk else -- Gitee From 43ea76a4c46f0981e2b6ea558ca04e33dbbd052f Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 28 Oct 2025 09:15:00 -0700 Subject: [PATCH 30/43] vfio/type1: sanitize for overflow using check_*_overflow() ANBZ: #31969 commit 6012379ede6aa7477db6276bb9876fe7d67c4312 upstream. Adopt check_*_overflow() functions to clearly express overflow check intent. 
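The recurring pattern, taken from the pin_pages hunk below, computes the byte size and the inclusive end address with explicit overflow checks instead of open-coded wrap tests:

	dma_addr_t iova_end;
	size_t iova_size;

	/* npage * PAGE_SIZE and user_iova + size - 1 must not wrap */
	if (check_mul_overflow(npage, PAGE_SIZE, &iova_size) ||
	    check_add_overflow(user_iova, iova_size - 1, &iova_end))
		return -EOVERFLOW;

Working with the inclusive end (size - 1) also keeps a range that finishes exactly at the top of the address space representable.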
Tested-by: Alejandro Jimenez Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation") Reviewed-by: Jason Gunthorpe Reviewed-by: Alejandro Jimenez Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20251028-fix-unmap-v6-1-2542b96bcc8e@fb.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/vfio_iommu_type1.c | 86 ++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index aebf21f550e4..b877f5d72864 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "vfio.h" #define DRIVER_VERSION "0.2" @@ -181,7 +182,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, } static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, - dma_addr_t start, u64 size) + dma_addr_t start, size_t size) { struct rb_node *res = NULL; struct rb_node *node = iommu->dma_list.rb_node; @@ -803,14 +804,20 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, unsigned long remote_vaddr; struct vfio_dma *dma; bool do_accounting; + dma_addr_t iova_end; + size_t iova_size; - if (!iommu || !pages) + if (!iommu || !pages || npage <= 0) return -EINVAL; /* Supported for v2 version only */ if (!iommu->v2) return -EACCES; + if (check_mul_overflow(npage, PAGE_SIZE, &iova_size) || + check_add_overflow(user_iova, iova_size - 1, &iova_end)) + return -EOVERFLOW; + mutex_lock(&iommu->lock); if (WARN_ONCE(iommu->vaddr_invalid_count, @@ -916,12 +923,21 @@ static void vfio_iommu_type1_unpin_pages(void *iommu_data, { struct vfio_iommu *iommu = iommu_data; bool do_accounting; + dma_addr_t iova_end; + size_t iova_size; int i; /* Supported for v2 version only */ if (WARN_ON(!iommu->v2)) return; + if (WARN_ON(npage <= 0)) + return; + + if (WARN_ON(check_mul_overflow(npage, PAGE_SIZE, &iova_size) || + check_add_overflow(user_iova, iova_size - 1, &iova_end))) + return; + mutex_lock(&iommu->lock); do_accounting = list_empty(&iommu->domain_list); @@ -1283,7 +1299,8 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, int ret = -EINVAL, retries = 0; unsigned long pgshift; dma_addr_t iova = unmap->iova; - u64 size = unmap->size; + dma_addr_t iova_end; + size_t size = unmap->size; bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; struct rb_node *n, *first_n; @@ -1296,6 +1313,11 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, goto unlock; } + if (iova != unmap->iova || size != unmap->size) { + ret = -EOVERFLOW; + goto unlock; + } + pgshift = __ffs(iommu->pgsize_bitmap); pgsize = (size_t)1 << pgshift; @@ -1305,10 +1327,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap_all) { if (iova || size) goto unlock; - size = U64_MAX; - } else if (!size || size & (pgsize - 1) || - iova + size - 1 < iova || size > SIZE_MAX) { - goto unlock; + size = SIZE_MAX; + } else { + if (!size || size & (pgsize - 1)) + goto unlock; + + if (check_add_overflow(iova, size - 1, &iova_end)) { + ret = -EOVERFLOW; + goto unlock; + } } /* When dirty tracking is enabled, allow only min supported pgsize */ @@ -1355,7 +1382,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (dma && dma->iova != iova) goto unlock; - dma = vfio_find_dma(iommu, iova + size - 1, 0); + dma = vfio_find_dma(iommu, iova_end, 0); if (dma && dma->iova + dma->size != iova + size) goto unlock; } @@ -1592,7 +1619,9 @@ static int 
vfio_dma_do_map(struct vfio_iommu *iommu, { bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR; dma_addr_t iova = map->iova; + dma_addr_t iova_end; unsigned long vaddr = map->vaddr; + unsigned long vaddr_end; size_t size = map->size; int ret = 0, prot = 0; size_t pgsize; @@ -1600,8 +1629,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, /* Verify that none of our __u64 fields overflow */ if (map->size != size || map->vaddr != vaddr || map->iova != iova) + return -EOVERFLOW; + + if (!size) return -EINVAL; + if (check_add_overflow(iova, size - 1, &iova_end) || + check_add_overflow(vaddr, size - 1, &vaddr_end)) + return -EOVERFLOW; + /* READ/WRITE from device perspective */ if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) prot |= IOMMU_WRITE; @@ -1617,13 +1653,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, WARN_ON((pgsize - 1) & PAGE_MASK); - if (!size || (size | iova | vaddr) & (pgsize - 1)) { - ret = -EINVAL; - goto out_unlock; - } - - /* Don't allow IOVA or virtual address wrap */ - if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { + if ((size | iova | vaddr) & (pgsize - 1)) { ret = -EINVAL; goto out_unlock; } @@ -1654,7 +1684,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, goto out_unlock; } - if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) { + if (!vfio_iommu_iova_dma_valid(iommu, iova, iova_end)) { ret = -EINVAL; goto out_unlock; } @@ -2965,7 +2995,8 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, struct vfio_iommu_type1_dirty_bitmap_get range; unsigned long pgshift; size_t data_size = dirty.argsz - minsz; - size_t iommu_pgsize; + size_t size, iommu_pgsize; + dma_addr_t iova, iova_end; if (!data_size || data_size < sizeof(range)) return -EINVAL; @@ -2974,14 +3005,24 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, sizeof(range))) return -EFAULT; - if (range.iova + range.size < range.iova) + iova = range.iova; + size = range.size; + + if (iova != range.iova || size != range.size) + return -EOVERFLOW; + + if (!size) return -EINVAL; + + if (check_add_overflow(iova, size - 1, &iova_end)) + return -EOVERFLOW; + if (!access_ok((void __user *)range.bitmap.data, range.bitmap.size)) return -EINVAL; pgshift = __ffs(range.bitmap.pgsize); - ret = verify_bitmap_size(range.size >> pgshift, + ret = verify_bitmap_size(size >> pgshift, range.bitmap.size); if (ret) return ret; @@ -2995,19 +3036,18 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, ret = -EINVAL; goto out_unlock; } - if (range.iova & (iommu_pgsize - 1)) { + if (iova & (iommu_pgsize - 1)) { ret = -EINVAL; goto out_unlock; } - if (!range.size || range.size & (iommu_pgsize - 1)) { + if (size & (iommu_pgsize - 1)) { ret = -EINVAL; goto out_unlock; } if (iommu->dirty_page_tracking) ret = vfio_iova_dirty_bitmap(range.bitmap.data, - iommu, range.iova, - range.size, + iommu, iova, size, range.bitmap.pgsize); else ret = -EINVAL; -- Gitee From e6802b0fcd56cb458fa84ad4ae8e324c006491ac Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 3 Mar 2026 11:46:24 -0800 Subject: [PATCH 31/43] vfio: selftests: fix crash in vfio_dma_mapping_mmio_test ANBZ: #31969 commit f183963891b4b0126f19aa0993ed931f3f3f9520 upstream. Remove the __iommu_unmap() call on a region that was never mapped. When __iommu_map() fails (expected for MMIO vaddrs in non-VFIO modes), the region is not added to the dma_regions list, leaving its list_head zero-initialized. If the unmap ioctl returns success, __iommu_unmap() calls list_del_init() on this zeroed node and crashes. 
This fixes the iommufd_compat_type1 and iommufd_compat_type1v2 test variants. Fixes: 080723f4d4c3 ("vfio: selftests: Add vfio_dma_mapping_mmio_test") Signed-off-by: Alex Mastro Reviewed-by: David Matlack Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20260303-fix-mmio-test-v1-1-78b4a9e46a4e@fb.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c index 957a89ce7b3a..d7f25ef77671 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -100,7 +100,6 @@ static void do_mmio_map_test(struct iommu *iommu, iommu_unmap(iommu, ®ion); } else { VFIO_ASSERT_NE(__iommu_map(iommu, ®ion), 0); - VFIO_ASSERT_NE(__iommu_unmap(iommu, ®ion, NULL), 0); } } -- Gitee From c181114f8f29ffb7df52a6305039a42cfea58fd6 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Wed, 15 Apr 2026 11:17:52 -0700 Subject: [PATCH 32/43] vfio/pci: Clean up DMABUFs before disabling function ANBZ: #31969 commit d97708701434ce72968e771976aaf9d3438fcafd upstream. On device shutdown, make vfio_pci_core_close_device() call vfio_pci_dma_buf_cleanup() before the function is disabled via vfio_pci_core_disable(). This ensures that all access via DMABUFs is revoked before the function's BARs become inaccessible. This fixes an issue where, if the function is disabled first, a tiny window exists in which the function's MSE is cleared and yet BARs could still be accessed via the DMABUF. The resources would also be freed and up for grabs by a different driver. Fixes: 5d74781ebc86c ("vfio/pci: Add dma-buf export support for MMIO regions") Signed-off-by: Matt Evans Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260415181752.1027604-1-mattev@meta.com Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 3a475cd4fa17..e95271f2f12b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -738,10 +738,10 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif - vfio_pci_core_disable(vdev); - vfio_pci_dma_buf_cleanup(vdev); + vfio_pci_core_disable(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); -- Gitee From 8199a2ae599642fb3ceb48d27a7fc515e6a2409b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 23 Mar 2026 15:56:58 -0600 Subject: [PATCH 33/43] vfio/pci: Fix double free in dma-buf feature ANBZ: #31969 commit e98137f0a874ab36d0946de4707aa48cb7137d1c upstream. The error path through vfio_pci_core_feature_dma_buf() ignores its own advice to only use dma_buf_put() after dma_buf_export(), instead falling through the entire unwind chain. In the unlikely event that we encounter file descriptor exhaustion, this can result in an unbalanced refcount on the vfio device and double free of allocated objects. Avoid this by moving the "put" directly into the error path and return the errno rather than entering the unwind chain. 
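The corrected tail of vfio_pci_core_feature_dma_buf() is deliberately a dead end rather than a goto into the unwind chain (from the diff below):

	ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
	if (ret < 0)
		/* the dmabuf release callback now owns the rest of the unwind */
		dma_buf_put(priv->dmabuf);
	return ret;

Once dma_buf_export() has succeeded, the dmabuf owns priv and the device registration reference, so dropping the dmabuf reference is the only cleanup the caller may perform.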
Reported-by: Renato Marziano Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions") Cc: stable@vger.kernel.org Acked-by: Leon Romanovsky Signed-off-by: Alex Williamson Link: https://lore.kernel.org/r/20260323215659.2108191-3-alex.williamson@nvidia.com Reviewed-by: Jason Gunthorpe Signed-off-by: Alex Williamson Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_dmabuf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index c0156c585b49..d894135b051b 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -302,11 +302,10 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, */ ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); if (ret < 0) - goto err_dma_buf; + dma_buf_put(priv->dmabuf); + return ret; -err_dma_buf: - dma_buf_put(priv->dmabuf); err_dev_put: vfio_device_put_registration(&vdev->vdev); err_free_phys: -- Gitee From e181f9a037581904d853fa75d19423620bfd690d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:13 +0200 Subject: [PATCH 34/43] dma-buf: Rename .move_notify() callback to a clearer identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit ef246da8e63c486780dca4d9b4d79589cbebf5e5 upstream. Rename the .move_notify() callback to .invalidate_mappings() to make its purpose explicit and highlight that it is responsible for invalidating existing mappings. Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-1-f98fca917e96@nvidia.com Signed-off-by: Christian König [Ruidong: delete change in drivers/gpu/drm/virtio/virtgpu_prime.c: not support dma-buf yet drivers/gpu/drm/xe/tests/xe_dma_buf.c: not support xe drivers drivers/gpu/drm/xe/xe_dma_buf.c: not support xe drivers ] Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 ++-- drivers/infiniband/core/umem_dmabuf.c | 4 ++-- drivers/infiniband/hw/mlx5/mr.c | 2 +- drivers/iommu/iommufd/pages.c | 2 +- include/linux/dma-buf.h | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 11b077129bc5..6cb5d9568018 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -898,7 +898,7 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, if (WARN_ON(!dmabuf || !dev)) return ERR_PTR(-EINVAL); - if (WARN_ON(importer_ops && !importer_ops->move_notify)) + if (WARN_ON(importer_ops && !importer_ops->invalidate_mappings)) return ERR_PTR(-EINVAL); attach = kzalloc(sizeof(*attach), GFP_KERNEL); @@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_NS_GPL(dma_buf_pin, DMA_BUF); * * This unpins a buffer pinned by dma_buf_pin() and allows the exporter to move * any mapping of @attach again and inform the importer through - * &dma_buf_attach_ops.move_notify. + * &dma_buf_attach_ops.invalidate_mappings. 
*/ void dma_buf_unpin(struct dma_buf_attachment *attach) { @@ -1212,7 +1212,7 @@ void dma_buf_move_notify(struct dma_buf *dmabuf) list_for_each_entry(attach, &dmabuf->attachments, node) if (attach->importer_ops) - attach->importer_ops->move_notify(attach); + attach->importer_ops->invalidate_mappings(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, DMA_BUF); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 493e18bcea06..ac1f222a3ae2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -378,7 +378,7 @@ amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf) } /** - * amdgpu_dma_buf_move_notify - &attach.move_notify implementation + * amdgpu_dma_buf_move_notify - &attach.invalidate_mappings implementation * * @attach: the DMA-buf attachment * @@ -445,7 +445,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops = { .allow_peer2peer = true, - .move_notify = amdgpu_dma_buf_move_notify + .invalidate_mappings = amdgpu_dma_buf_move_notify }; /** diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 39357dc2d229..75a270b820c0 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -124,7 +124,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, if (check_add_overflow(offset, (unsigned long)size, &end)) return ret; - if (unlikely(!ops || !ops->move_notify)) + if (unlikely(!ops || !ops->invalidate_mappings)) return ret; dmabuf = dma_buf_get(fd); @@ -181,7 +181,7 @@ ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .allow_peer2peer = true, - .move_notify = ib_umem_dmabuf_unsupported_move_notify, + .invalidate_mappings = ib_umem_dmabuf_unsupported_move_notify, }; struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9e465cf99733..a9cb0f34f90f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1533,7 +1533,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .allow_peer2peer = 1, - .move_notify = mlx5_ib_dmabuf_invalidate_cb, + .invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb, }; struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 68da809ad07c..1d833b84bba4 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1545,7 +1545,7 @@ static void iopt_revoke_notify(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { .allow_peer2peer = true, - .move_notify = iopt_revoke_notify, + .invalidate_mappings = iopt_revoke_notify, }; /* diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 28066eb28fd5..042cda987a33 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -406,7 +406,7 @@ struct dma_buf { * through the device. * * - Dynamic importers should set fences for any access that they can't - * disable immediately from their &dma_buf_attach_ops.move_notify + * disable immediately from their &dma_buf_attach_ops.invalidate_mappings * callback. 
* * IMPORTANT: @@ -457,7 +457,7 @@ struct dma_buf_attach_ops { bool allow_peer2peer; /** - * @move_notify: [optional] notification that the DMA-buf is moving + * @invalidate_mappings: [optional] notification that the DMA-buf is moving * * If this callback is provided the framework can avoid pinning the * backing store while mappings exists. @@ -474,7 +474,7 @@ struct dma_buf_attach_ops { * New mappings can be created after this callback returns, and will * point to the new location of the DMA-buf. */ - void (*move_notify)(struct dma_buf_attachment *attach); + void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; /** -- Gitee From 5e20e770955430473d6e020b181e5bfc7c5ee940 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:14 +0200 Subject: [PATCH 35/43] dma-buf: Rename dma_buf_move_notify() to dma_buf_invalidate_mappings() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 95308225e5baeaae1e313816059c59a0036ab6b2 upstream. Along with renaming the .move_notify() callback, rename the corresponding dma-buf core function. This makes the expected behavior clear to exporters calling this function. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-2-f98fca917e96@nvidia.com Signed-off-by: Christian König Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +- drivers/iommu/iommufd/selftest.c | 2 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 4 ++-- include/linux/dma-buf.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 6cb5d9568018..ffbd4450d10b 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -862,7 +862,7 @@ dma_buf_pin_on_map(struct dma_buf_attachment *attach) * 3. Exporters must hold the dma-buf reservation lock when calling these * functions: * - * - dma_buf_move_notify() + * - dma_buf_invalidate_mappings() */ /** @@ -1197,14 +1197,14 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, DMA_BUF); /** - * dma_buf_move_notify - notify attachments that DMA-buf is moving + * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their * mappings. */ -void dma_buf_move_notify(struct dma_buf *dmabuf) +void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; @@ -1214,7 +1214,7 @@ void dma_buf_move_notify(struct dma_buf *dmabuf) if (attach->importer_ops) attach->importer_ops->invalidate_mappings(attach); } -EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, DMA_BUF); +EXPORT_SYMBOL_NS_GPL(dma_buf_invalidate_mappings, DMA_BUF); /** * DOC: cpu access diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 4e9ae52ef9fd..d5d1eae7c933 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1275,7 +1275,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, if (abo->tbo.base.dma_buf && !abo->tbo.base.import_attach && old_mem && old_mem->mem_type != TTM_PL_SYSTEM) - dma_buf_move_notify(abo->tbo.base.dma_buf); + dma_buf_invalidate_mappings(abo->tbo.base.dma_buf); /* move_notify is called before move happens */ trace_amdgpu_bo_move(abo, new_mem ? 
new_mem->mem_type : -1, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7f886810690c..b88b11c61450 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -2165,7 +2165,7 @@ static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, priv = dmabuf->priv; dma_resv_lock(dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(dmabuf); + dma_buf_invalidate_mappings(dmabuf); dma_resv_unlock(dmabuf->resv); err_put: diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index d894135b051b..c9d7524f5cb9 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -331,7 +331,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (priv->revoked != revoked) { dma_resv_lock(priv->dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); } fput(priv->dmabuf->file); @@ -352,7 +352,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) list_del_init(&priv->dmabufs_elm); priv->vdev = NULL; priv->revoked = true; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 042cda987a33..37055b783cd4 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -599,7 +599,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, enum dma_data_direction); void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); -void dma_buf_move_notify(struct dma_buf *dma_buf); +void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- Gitee From c4a90ce6d8f4ec1d6f08103f503b348c811687fd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:15 +0200 Subject: [PATCH 36/43] dma-buf: Always build with DMABUF_MOVE_NOTIFY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 609fc8766d68da10697e1df0514d079d54bf48df upstream. DMABUF_MOVE_NOTIFY was introduced in 2018 and has been marked as experimental and disabled by default ever since. Six years later, all new importers implement this callback. It is therefore reasonable to drop CONFIG_DMABUF_MOVE_NOTIFY and always build DMABUF with support for it enabled. Suggested-by: Christian König Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-3-f98fca917e96@nvidia.com Signed-off-by: Christian König [Ruidong: delete change in drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c drivers/gpu/drm/xe/tests/xe_dma_buf.c drivers/gpu/drm/xe/xe_dma_buf.c ] Signed-off-by: Ruidong Tian --- drivers/dma-buf/Kconfig | 12 ------------ drivers/dma-buf/dma-buf.c | 3 +-- drivers/gpu/drm/amd/amdkfd/Kconfig | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index e4dc53a36428..709cf26936a3 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -39,18 +39,6 @@ config UDMABUF A driver to let userspace turn memfd regions into dma-bufs. 
Qemu can use this to create host dmabufs for guest framebuffers. -config DMABUF_MOVE_NOTIFY - bool "Move notify between drivers (EXPERIMENTAL)" - default n - depends on DMA_SHARED_BUFFER - help - Don't pin buffers if the dynamic DMA-buf interface is available on - both the exporter as well as the importer. This fixes a security - problem where userspace is able to pin unrestricted amounts of memory - through DMA-buf. - This is marked experimental because we don't yet have a consistent - execution context and memory management between drivers. - config DMABUF_DEBUG bool "DMA-BUF debug checks" depends on DMA_SHARED_BUFFER diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index ffbd4450d10b..b67ddf1822bd 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -797,8 +797,7 @@ static bool dma_buf_pin_on_map(struct dma_buf_attachment *attach) { return attach->dmabuf->ops->pin && - (!dma_buf_attachment_is_dynamic(attach) || - !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)); + !dma_buf_attachment_is_dynamic(attach); } /** diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index d3c3d3ab7225..abe490f84993 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -27,7 +27,7 @@ config HSA_AMD_SVM config HSA_AMD_P2P bool "HSA kernel driver support for peer-to-peer for AMD GPU devices" - depends on HSA_AMD && PCI_P2PDMA && DMABUF_MOVE_NOTIFY + depends on HSA_AMD && PCI_P2PDMA help Enable peer-to-peer (P2P) communication between AMD GPUs over the PCIe bus. This can improve performance of multi-GPU compute -- Gitee From a71edc8bb84b41f004b604561020b450da60411c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:14 +0200 Subject: [PATCH 37/43] vfio: Wait for dma-buf invalidation to complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 1a8a5227f22996d3e503c60569b1813a404da033 upstream. dma-buf invalidation is handled asynchronously by the hardware, so VFIO must wait until all affected objects have been fully invalidated. In addition, the dma-buf exporter is expecting that all importers unmap any buffers they previously mapped. 
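[Editor's note: a condensed sketch of the waiting scheme the diff below adds; locking and error paths are trimmed. Each successful map op takes a kref, each unmap drops one, and the final put fires a completion that the revoke path waits on:

	kref_get(&priv->kref);				/* in .map_dma_buf */
	kref_put(&priv->kref, vfio_pci_dma_buf_done);	/* in .unmap_dma_buf */

	/* revoke: ask importers to unmap, wait for fences to retire ... */
	dma_buf_invalidate_mappings(priv->dmabuf);
	dma_resv_wait_timeout(priv->dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
			      false, MAX_SCHEDULE_TIMEOUT);
	/* ... then drop the initial ref and wait out outstanding mappings */
	kref_put(&priv->kref, vfio_pci_dma_buf_done);
	wait_for_completion(&priv->comp);
]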
Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions") Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Signed-off-by: Leon Romanovsky Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-4-463d956bd527@nvidia.com Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_dmabuf.c | 61 ++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index c9d7524f5cb9..97fae2d9f16b 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf { struct dma_buf_phys_vec *phys_vec; struct p2pdma_provider *provider; u32 nr_ranges; + struct kref kref; + struct completion comp; u8 revoked : 1; }; @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, return 0; } +static void vfio_pci_dma_buf_done(struct kref *kref) +{ + struct vfio_pci_dma_buf *priv = + container_of(kref, struct vfio_pci_dma_buf, kref); + + complete(&priv->comp); +} + static struct sg_table * vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, enum dma_data_direction dir) { struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + struct sg_table *ret; dma_resv_assert_held(priv->dmabuf->resv); if (priv->revoked) return ERR_PTR(-ENODEV); - return dma_buf_phys_vec_to_sgt(attachment, priv->provider, - priv->phys_vec, priv->nr_ranges, - priv->size, dir); + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); + if (IS_ERR(ret)) + return ret; + + kref_get(&priv->kref); + return ret; } static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, struct sg_table *sgt, enum dma_data_direction dir) { + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + dma_buf_free_sgt(attachment, sgt, dir); + kref_put(&priv->kref, vfio_pci_dma_buf_done); } static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, goto err_dev_put; } + kref_init(&priv->kref); + init_completion(&priv->comp); + /* dma_buf_put() now frees priv */ INIT_LIST_HEAD(&priv->dmabufs_elm); down_write(&vdev->memory_lock); @@ -330,9 +354,33 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (priv->revoked != revoked) { dma_resv_lock(priv->dmabuf->resv, NULL); - priv->revoked = revoked; + if (revoked) + priv->revoked = true; dma_buf_invalidate_mappings(priv->dmabuf); + dma_resv_wait_timeout(priv->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(priv->dmabuf->resv); + if (revoked) { + kref_put(&priv->kref, vfio_pci_dma_buf_done); + wait_for_completion(&priv->comp); + } else { + /* + * The kref is initialized again because, when revoke + * was performed, the reference counter was decreased + * to zero to trigger completion. + */ + kref_init(&priv->kref); + /* + * There is no need to wait as no mapping was + * performed when the previous status was + * priv->revoked == true.
+ */ + reinit_completion(&priv->comp); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = false; + dma_resv_unlock(priv->dmabuf->resv); + } } fput(priv->dmabuf->file); } @@ -353,7 +401,12 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) priv->vdev = NULL; priv->revoked = true; dma_buf_invalidate_mappings(priv->dmabuf); + dma_resv_wait_timeout(priv->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(priv->dmabuf->resv); + kref_put(&priv->kref, vfio_pci_dma_buf_done); + wait_for_completion(&priv->comp); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); } -- Gitee From 4019ce489ebfd0496ef68dc40b97727ec25b1390 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:15 +0200 Subject: [PATCH 38/43] dma-buf: Make .invalidate_mapping() truly optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 575157b1b5fcfcdd0f9c42d635dcf8219f8c86dc upstream. The .invalidate_mapping() callback is documented as optional, yet it effectively became mandatory whenever importer_ops were provided. This led to cases where RDMA non-ODP code had to supply an empty stub. Relax the checks in the dma-buf core so the callback can be omitted, allowing RDMA code to drop the unnecessary function. Removing the stub allows the next patch to tell that RDMA does not support .invalidate_mapping() by checking for a NULL op. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-5-463d956bd527@nvidia.com Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf.c | 6 ++---- drivers/infiniband/core/umem_dmabuf.c | 10 ---------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index b67ddf1822bd..102d985d3d15 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -897,9 +897,6 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, if (WARN_ON(!dmabuf || !dev)) return ERR_PTR(-EINVAL); - if (WARN_ON(importer_ops && !importer_ops->invalidate_mappings)) - return ERR_PTR(-EINVAL); - attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return ERR_PTR(-ENOMEM); @@ -1210,7 +1207,8 @@ void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) dma_resv_assert_held(dmabuf->resv); list_for_each_entry(attach, &dmabuf->attachments, node) - if (attach->importer_ops) + if (attach->importer_ops && + attach->importer_ops->invalidate_mappings) attach->importer_ops->invalidate_mappings(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_invalidate_mappings, DMA_BUF); diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 75a270b820c0..c221bed3ac8b 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -170,18 +170,8 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, } EXPORT_SYMBOL(ib_umem_dmabuf_get); -static void -ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) -{ - struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; - - ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, - "Invalidate callback should not be called when memory is pinned\n"); -} - static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .allow_peer2peer = true, - .invalidate_mappings = ib_umem_dmabuf_unsupported_move_notify, }; struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct 
ib_device *device, -- Gitee From 64e039bf46b6663f394b871e6d9a430d4c2cf2f9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:16 +0200 Subject: [PATCH 39/43] dma-buf: Add dma_buf_attach_revocable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit be6d4c9e9d714ebbf358be41332726a0f94b9ffa upstream. Some exporters need a flow to synchronously revoke access to the DMA-buf by importers. Once revoke is completed the importer is not permitted to touch the memory otherwise they may get IOMMU faults, AERs, or worse. DMA-buf today defines a revoke flow, for both pinned and dynamic importers, which is broadly: dma_resv_lock(dmabuf->resv, NULL); // Prevent new mappings from being established priv->revoked = true; // Tell all importers to eventually unmap dma_buf_invalidate_mappings(dmabuf); // Wait for any in-progress fences on the old mapping dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP, false, MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(dmabuf->resv); // Wait for all importers to complete unmap wait_for_completion(&priv->unmapped_comp); This works well, and an importer that continues to access the DMA-buf after unmapping it is very buggy. However, the final wait for unmap is effectively unbounded. Several importers do not support invalidate_mappings() at all and won't unmap until userspace triggers it. This unbounded wait is not suitable for exporters like VFIO and RDMA that need to issue revoke as part of their normal operations. Add dma_buf_attach_revocable() to allow exporters to determine the difference between importers that can complete the above in bounded time, and those that can't. It can be called inside the exporter's attach op to reject incompatible importers. Document these details about how dma_buf_invalidate_mappings() works and what the required sequence is to achieve a full revocation. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-6-463d956bd527@nvidia.com Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf.c | 48 ++++++++++++++++++++++++++++++++++++++- include/linux/dma-buf.h | 9 +++----- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 102d985d3d15..eb8a6589faa1 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1192,13 +1192,59 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, DMA_BUF); +/** + * dma_buf_attach_revocable - check if a DMA-buf importer implements + * revoke semantics. + * @attach: the DMA-buf attachment to check + * + * Returns true if the DMA-buf importer can support the revoke sequence + * explained in dma_buf_invalidate_mappings() within bounded time, meaning the + * importer implements invalidate_mappings() and ensures that unmap is called as + * a result. + */ +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach) +{ + return attach->importer_ops && + attach->importer_ops->invalidate_mappings; +} +EXPORT_SYMBOL_NS_GPL(dma_buf_attach_revocable, DMA_BUF); + /** * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their - * mappings.
If the attachment is dynamic then the dynamic importer is expected + * to invalidate any caches it has of the mapping result and perform a new + * mapping request before allowing HW to do any further DMA. + * + * If the attachment is pinned then this informs the pinned importer that the + * underlying mapping is no longer available. Pinned importers may take this + * as a permanent revocation and never establish new mappings so exporters + * should not trigger it lightly. + * + * Upon return importers may continue to access the DMA-buf memory. The caller + * must do two additional waits to ensure that the memory is no longer being + * accessed: + * 1) Until dma_resv_wait_timeout() retires fences the importer is allowed to + * fully access the memory. + * 2) Until the importer calls unmap it is allowed to speculatively + * read-and-discard the memory. It must not write to the memory. + * + * A caller wishing to use dma_buf_invalidate_mappings() to fully stop access to + * the DMA-buf must wait for both. Dynamic callers can often use just the first. + * + * All importers providing an invalidate_mappings() op must ensure that unmap is + * called within bounded time after the op. + * + * Pinned importers that do not support an invalidate_mappings() op will + * eventually perform unmap when they are done with the buffer, which may be an + * unbounded time from calling this function. dma_buf_attach_revocable() can be + * used to prevent such importers from attaching. + * + * Importers are free to request a new mapping in parallel as this function + * returns. */ void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 37055b783cd4..012b921b2146 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -467,12 +467,8 @@ struct dma_buf_attach_ops { * called with this lock held as well. This makes sure that no mapping * is created concurrently with an ongoing move operation. * - * Mappings stay valid and are not directly affected by this callback. - * But the DMA-buf can now be in a different physical location, so all - * mappings should be destroyed and re-created as soon as possible. - * - * New mappings can be created after this callback returns, and will - * point to the new location of the DMA-buf. + * See the kdoc for dma_buf_invalidate_mappings() for details on the + * required behavior. */ void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; @@ -600,6 +596,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- Gitee From fb47a0296a5046aaa7fd0d38b4c330ce6a7d3a65 Mon Sep 17 00:00:00 2001 From: Sumit Semwal Date: Fri, 10 Apr 2026 18:07:03 +0530 Subject: [PATCH 40/43] dma-buf: fix htmldocs error for dma_buf_attach_revocable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit dc6d51959ec0c08366d5aaeb5b8fb02d814d1e4b upstream. linux-next testing showed this htmldoc error due to a missing extra line in the comments; add it.
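[Editor's note: the kdoc added by the previous patch defines the importer side of the contract; here is a hedged sketch of what a revocable importer might look like. my_invalidate(), my_stop_hw_access() and the work item are hypothetical; the only documented requirements are that new DMA is fenced off and that unmap follows within bounded time:

	static void my_invalidate(struct dma_buf_attachment *attach)
	{
		struct my_importer *imp = attach->importer_priv;

		/* called with dmabuf->resv held: stop issuing new DMA now */
		my_stop_hw_access(imp);
		/* guarantee dma_buf_unmap_attachment() runs soon after */
		queue_work(system_wq, &imp->unmap_work);
	}

	static const struct dma_buf_attach_ops my_attach_ops = {
		.allow_peer2peer = true,
		.invalidate_mappings = my_invalidate,
	};
]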
Fixes: be6d4c9e9d714 ("dma-buf: Add dma_buf_attach_revocable()") Reported-by: Mark Brown Closes: https://lore.kernel.org/lkml/adaNJaF58PZlvs6K@sirena.org.uk/ Signed-off-by: Sumit Semwal Reviewed-by: Christian König Link: https://patch.msgid.link/20260410123703.937822-1-sumit.semwal@linaro.org Signed-off-by: Ruidong Tian --- drivers/dma-buf/dma-buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index eb8a6589faa1..72daef4d505e 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1227,6 +1227,7 @@ EXPORT_SYMBOL_NS_GPL(dma_buf_attach_revocable, DMA_BUF); * Upon return importers may continue to access the DMA-buf memory. The caller * must do two additional waits to ensure that the memory is no longer being * accessed: + * * 1) Until dma_resv_wait_timeout() retires fences the importer is allowed to * fully access the memory. * 2) Until the importer calls unmap it is allowed to speculatively -- Gitee From b61edc01f7dfa6f328b95cd777474bacfe5d5e8e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:17 +0200 Subject: [PATCH 41/43] vfio: Permit VFIO to work with pinned importers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 6ffe939bf94fbbeaaa4050a0e9c22820e959b15f upstream. Till now VFIO has rejected pinned importers, largely to avoid being used with the RDMA pinned importer that cannot handle a move_notify() to revoke access. Using dma_buf_attach_revocable() it can tell the difference between pinned importers that support the flow described in dma_buf_invalidate_mappings() and those that don't. Thus permit compatible pinned importers. This is one of two items IOMMUFD requires to remove its private interface to VFIO's dma-buf. 
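[Editor's note: the effect of the diff below on importers, sketched with illustrative ops structs; my_revoke_handler is hypothetical:

	/* still rejected at attach time: revoke could block unboundedly */
	static struct dma_buf_attach_ops pinned_only_ops = {
		.allow_peer2peer = true,
	};

	/* now accepted: pinned, but honours invalidate_mappings() */
	static struct dma_buf_attach_ops revocable_pinned_ops = {
		.allow_peer2peer = true,
		.invalidate_mappings = my_revoke_handler,
	};
]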
Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Reviewed-by: Christian König Signed-off-by: Leon Romanovsky Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-7-463d956bd527@nvidia.com Signed-off-by: Ruidong Tian --- drivers/vfio/pci/vfio_pci_dmabuf.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 97fae2d9f16b..c907a74fd534 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -22,16 +22,6 @@ struct vfio_pci_dma_buf { u8 revoked : 1; }; -static int vfio_pci_dma_buf_pin(struct dma_buf_attachment *attachment) -{ - return -EOPNOTSUPP; -} - -static void vfio_pci_dma_buf_unpin(struct dma_buf_attachment *attachment) -{ - /* Do nothing */ -} - static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { @@ -43,6 +33,9 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (priv->revoked) return -ENODEV; + if (!dma_buf_attach_revocable(attachment)) + return -EOPNOTSUPP; + return 0; } @@ -107,8 +100,6 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) } static const struct dma_buf_ops vfio_pci_dmabuf_ops = { - .pin = vfio_pci_dma_buf_pin, - .unpin = vfio_pci_dma_buf_unpin, .attach = vfio_pci_dma_buf_attach, .map_dma_buf = vfio_pci_dma_buf_map, .unmap_dma_buf = vfio_pci_dma_buf_unmap, -- Gitee From 46c2e80f21e655793a379c6a91a83533f485c2f7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:18 +0200 Subject: [PATCH 42/43] iommufd: Add dma_buf_pin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31969 commit 8c5f9645c3893f0db679d9affe4fe4e665b990dd upstream. IOMMUFD relies on a private protocol with VFIO, and this always operated in pinned mode. Now that VFIO can support pinned importers, update IOMMUFD to invoke the normal dma-buf flow to request pin. This isn't enough to allow IOMMUFD to work with other exporters; it still needs a way to get the physical address list, which is another series. IOMMUFD supports the defined revoke semantics. It immediately stops and fences access to the memory inside its invalidate_mappings() callback, and it currently doesn't use scatterlists so doesn't call map/unmap at all. It is expected that a future revision can synchronously call unmap from the move_notify callback as well. Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Acked-by: Christian König Signed-off-by: Leon Romanovsky Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-8-463d956bd527@nvidia.com Signed-off-by: Ruidong Tian --- drivers/iommu/iommufd/pages.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 1d833b84bba4..6b73fcbf4181 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1595,16 +1595,22 @@ static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, mutex_unlock(&pages->mutex); } - rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + rc = dma_buf_pin(attach); if (rc) goto err_detach; + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_unpin; + dma_resv_unlock(dmabuf->resv); /* On success iopt_release_pages() will detach and put the dmabuf.
*/ pages->dmabuf.attach = attach; return 0; +err_unpin: + dma_buf_unpin(attach); err_detach: dma_resv_unlock(dmabuf->resv); dma_buf_detach(dmabuf, attach); @@ -1750,6 +1756,7 @@ void iopt_release_pages(struct kref *kref) if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + dma_buf_unpin(pages->dmabuf.attach); dma_buf_detach(dmabuf, pages->dmabuf.attach); dma_buf_put(dmabuf); WARN_ON(!list_empty(&pages->dmabuf.tracker)); -- Gitee From 1f0e411a8fbeb60ec3cb7d9a54b85888a61a79e2 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Wed, 6 May 2026 01:53:37 +0800 Subject: [PATCH 43/43] anolis: configs: update dmabuf move and AMD p2p ANBZ: #31969 Signed-off-by: Ruidong Tian --- anolis/configs/L2-OPTIONAL/arm64/CONFIG_HSA_AMD_P2P | 1 + anolis/configs/L2-OPTIONAL/default/CONFIG_DMABUF_MOVE_NOTIFY | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_HSA_AMD_P2P delete mode 100644 anolis/configs/L2-OPTIONAL/default/CONFIG_DMABUF_MOVE_NOTIFY diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_HSA_AMD_P2P b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_HSA_AMD_P2P new file mode 100644 index 000000000000..c77f8389fb1e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_HSA_AMD_P2P @@ -0,0 +1 @@ +CONFIG_HSA_AMD_P2P=y diff --git a/anolis/configs/L2-OPTIONAL/default/CONFIG_DMABUF_MOVE_NOTIFY b/anolis/configs/L2-OPTIONAL/default/CONFIG_DMABUF_MOVE_NOTIFY deleted file mode 100644 index 33c2fe87688e..000000000000 --- a/anolis/configs/L2-OPTIONAL/default/CONFIG_DMABUF_MOVE_NOTIFY +++ /dev/null @@ -1 +0,0 @@ -# CONFIG_DMABUF_MOVE_NOTIFY is not set -- Gitee
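[Editor's note: a condensed sketch of the attach/pin lifecycle that patch 42 above establishes in iopt_map_dmabuf()/iopt_release_pages(). Error unwinding, the surrounding mutex and locking details are trimmed, and the device argument to attach is simplified; sym_vfio_pci_dma_buf_iommufd_map() is the private VFIO hook that, per the commit message, a later series replaces:

	/* setup: attach, then pin and map under the reservation lock */
	attach = dma_buf_dynamic_attach(dmabuf, dev, &ops, pages);
	dma_resv_lock(dmabuf->resv, NULL);
	rc = dma_buf_pin(attach);	/* exporter may refuse, e.g. if revoked */
	if (!rc)
		rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys);
	dma_resv_unlock(dmabuf->resv);

	/* teardown in iopt_release_pages() mirrors setup in reverse */
	dma_buf_unpin(pages->dmabuf.attach);
	dma_buf_detach(dmabuf, pages->dmabuf.attach);
	dma_buf_put(dmabuf);
]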