From 19a21b70cf2f7298e1fae409dfeae47e044016b6 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 5 Sep 2023 09:07:09 +0800 Subject: [PATCH 01/25] KVM: X86: Reduce size of kvm_vcpu_arch structure when CONFIG_KVM_XEN=n ANBZ: #34453 commit ee11ab6bb04e1093d3cc5f5bea9779799ce1d2c7 upstream. When CONFIG_KVM_XEN=n, the size of kvm_vcpu_arch can be reduced from 5100+ to 4400+ by adding macro control. Signed-off-by: Peng Hao Link: https://lore.kernel.org/all/CAPm50aKwbZGeXPK5uig18Br8CF1hOS71CE2j_dLX+ub7oJdpGg@mail.gmail.com [sean: fix whitespace damage] Signed-off-by: Sean Christopherson Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 5 ++++- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4fa1779bef7..ce369b21fa68 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -680,6 +680,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +703,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -932,8 +934,9 @@ struct kvm_vcpu_arch { bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f8a84dc03635..b9770daa165a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -470,7 +470,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); +#ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); +#endif kvm_vcpu_after_set_cpuid(vcpu); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e07c9eaaf407..9dfd05078fcb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3236,11 +3236,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->pv_time.active) kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); +#ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); +#endif kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } -- Gitee From 7940778213f684a4b3ea6692a25b78060504ea61 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Oct 2023 14:52:30 -0400 Subject: [PATCH 02/25] hugetlbfs: convert to new timestamp accessors ANBZ: #34453 commit cfd87e76901fde31983860c6a377128173ee8b49 upstream. Convert to using the new inode timestamp accessor functions. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20231004185347.80880-43-jlayton@kernel.org Signed-off-by: Christian Brauner Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- fs/hugetlbfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2268a978e922..b072110eeb30 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -969,7 +969,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -1013,7 +1013,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mapping->private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { @@ -1056,7 +1056,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; @@ -1088,7 +1088,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } @@ -1110,7 +1110,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, } else iput(inode); } - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } -- Gitee From a3656e79d86e40cd7a0c8a3e7f6c6d78c16096f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:06 -0700 Subject: [PATCH 03/25] KVM: selftests: Drop unused kvm_userspace_memory_region_find() helper ANBZ: #34453 commit 335869c3f2b881bda6eeeb2df342841a1db3310b upstream. Drop kvm_userspace_memory_region_find(), it's unused and a terrible API (probably why it's unused). If anything outside of kvm_util.c needs to get at the memslot, userspace_mem_region_find() can be exposed to give others full access to all memory region/slot information. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-25-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 4 --- tools/testing/selftests/kvm/lib/kvm_util.c | 29 ------------------- 2 files changed, 33 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 4c4a2e3b8eee..d07f90fe67cd 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -777,10 +777,6 @@ vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) return n; } -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end); - #define sync_global_to_guest(vm, g) ({ \ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ memcpy(_p, &(g), sizeof(g)); \ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bc1ef536b640..cbf48cd60206 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -590,35 +590,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) return NULL; } -/* - * KVM Userspace Memory Region Find - * - * Input Args: - * vm - Virtual Machine - * start - Starting VM physical address - * end - Ending VM physical address, inclusive. - * - * Output Args: None - * - * Return: - * Pointer to overlapping region, NULL if no such region. - * - * Public interface to userspace_mem_region_find. Allows tests to look up - * the memslot datastructure for a given range of guest physical memory. - */ -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end) -{ - struct userspace_mem_region *region; - - region = userspace_mem_region_find(vm, start, end); - if (!region) - return NULL; - - return ®ion->region; -} - __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) { -- Gitee From 161c1dc7dc71880fab303619b7613fb0ba073686 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:07 -0700 Subject: [PATCH 04/25] KVM: selftests: Convert lib's mem regions to KVM_SET_USER_MEMORY_REGION2 ANBZ: #34453 commit 8d99e347c097ab3f9fb93d0f88dddf20051d7c88 upstream. Use KVM_SET_USER_MEMORY_REGION2 throughout KVM's selftests library so that support for guest private memory can be added without needing an entirely separate set of helpers. Note, this obviously makes selftests backwards-incompatible with older KVM versions from this point forward. [Backport Changes] 1. In tools/testing/selftests/kvm/lib/kvm_util.c, during the earlier backport of Anolis commit 818676db5e0c7b, the vm_ioctl() function call, was moved from __vm_mem_region_delete() to vm_mem_region_delete() with KVM_SET_USER_MEMORY_REGION instead of KVM_SET_USER_MEMORY_REGION2 deviating form upstream commit 5afe18dfa47dae. Since the current upstream commit replaces KVM_SET_USER_MEMORY_REGION with KVM_SET_USER_MEMORY_REGION2, fixed the deviation in the vm_ioctl() call to match with with upstream commit. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-26-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index d07f90fe67cd..46b62f5fee62 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -44,7 +44,7 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ struct userspace_mem_region { - struct kvm_userspace_memory_region region; + struct kvm_userspace_memory_region2 region; struct sparsebit *unused_phy_pages; int fd; off_t offset; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index cbf48cd60206..b0a0283cec70 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -453,8 +453,9 @@ void kvm_vm_restart(struct kvm_vm *vmp) vm_create_irqchip(vmp); hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { - int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%llx size: 0x%llx", @@ -1011,8 +1012,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.guest_phys_addr = guest_paddr; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%lx size: 0x%lx", @@ -1094,9 +1095,9 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) region->region.flags = flags; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i slot: %u flags: 0x%x", ret, errno, slot, flags); } @@ -1124,9 +1125,9 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) region->region.guest_phys_addr = new_gpa; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n" "ret: %i errno: %i slot: %u new_gpa: 0x%lx", ret, errno, slot, new_gpa); } @@ -1149,7 +1150,7 @@ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) struct userspace_mem_region *region = memslot2region(vm, slot); region->region.memory_size = 0; - vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); __vm_mem_region_delete(vm, region, true); } -- Gitee From e52b5db863cdb1901f2d918fb7f36966f27bae91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:08 -0700 Subject: [PATCH 05/25] KVM: selftests: Add support for creating private memslots ANBZ: #34453 commit bb2968ad6c33c0902dce48ea57d58c5bb4f3c617 upstream. Add support for creating "private" memslots via KVM_CREATE_GUEST_MEMFD and KVM_SET_USER_MEMORY_REGION2. Make vm_userspace_mem_region_add() a wrapper to its effective replacement, vm_mem_add(), so that private memslots are fully opt-in, i.e. don't require update all tests that add memory regions. Pivot on the KVM_MEM_PRIVATE flag instead of the validity of the "gmem" file descriptor so that simple tests can let vm_mem_add() do the heavy lifting of creating the guest memfd, but also allow the caller to pass in an explicit fd+offset so that fancier tests can do things like back multiple memslots with a single file. If the caller passes in a fd, dup() the fd so that (a) __vm_mem_region_delete() can close the fd associated with the memory region without needing yet another flag, and (b) so that the caller can safely close its copy of the fd without having to first destroy memslots. Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-27-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 23 ++++++ .../testing/selftests/kvm/include/test_util.h | 5 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 76 +++++++++++-------- 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 46b62f5fee62..f77b76d36755 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -432,6 +432,26 @@ static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) void vm_create_irqchip(struct kvm_vm *vm); +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, @@ -440,6 +460,9 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 7883c0a963fd..ec66b2283d1e 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -145,6 +145,11 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; } +static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t) +{ + return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM; +} + /* Aligns x up to the next multiple of size. Size must be a power of 2. */ static inline uint64_t align_up(uint64_t x, uint64_t size) { diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b0a0283cec70..6768ec15521c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -666,6 +666,8 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); close(region->fd); } + if (region->region.guest_memfd >= 0) + close(region->region.guest_memfd); free(region); } @@ -867,36 +869,15 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } -/* - * VM Userspace Memory Region Add - * - * Input Args: - * vm - Virtual Machine - * src_type - Storage source for this region. - * NULL to use anonymous memory. - * guest_paddr - Starting guest physical address - * slot - KVM region slot - * npages - Number of physical pages - * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES) - * - * Output Args: None - * - * Return: None - * - * Allocates a memory area of the number of pages specified by npages - * and maps it to the VM specified by vm, at a starting physical address - * given by guest_paddr. The region is created with a KVM region slot - * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The - * region is created with the flags given by flags. - */ -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags) +/* FIXME: This thing needs to be ripped apart and rewritten. */ +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; size_t backing_src_pagesz = get_backing_src_pagesz(src_type); + size_t mem_size = npages * vm->page_size; size_t alignment; TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, @@ -949,7 +930,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); TEST_ASSERT(region != NULL, "Insufficient Memory"); - region->mmap_size = npages * vm->page_size; + region->mmap_size = mem_size; #ifdef __s390x__ /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ @@ -996,14 +977,38 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* As needed perform madvise */ if ((src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { - ret = madvise(region->host_mem, npages * vm->page_size, + ret = madvise(region->host_mem, mem_size, src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", - region->host_mem, npages * vm->page_size, + region->host_mem, mem_size, vm_mem_backing_src_alias(src_type)->name); } region->backing_src_type = src_type; + + if (flags & KVM_MEM_GUEST_MEMFD) { + if (guest_memfd < 0) { + uint32_t guest_memfd_flags = 0; + TEST_ASSERT(!guest_memfd_offset, + "Offset must be zero when creating new guest_memfd"); + guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); + } else { + /* + * Install a unique fd for each memslot so that the fd + * can be closed when the region is deleted without + * needing to track if the fd is owned by the framework + * or by the caller. + */ + guest_memfd = dup(guest_memfd); + TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + } + + region->region.guest_memfd = guest_memfd; + region->region.guest_memfd_offset = guest_memfd_offset; + } else { + region->region.guest_memfd = -1; + } + region->unused_phy_pages = sparsebit_alloc(); sparsebit_set_num(region->unused_phy_pages, guest_paddr >> vm->page_shift, npages); @@ -1016,9 +1021,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx", + " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d\n", ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size); + guest_paddr, (uint64_t) region->region.memory_size, + region->region.guest_memfd); /* Add to quick lookup data structures */ vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); @@ -1039,6 +1045,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, } } +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, + uint64_t npages, uint32_t flags) +{ + vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); +} + /* * Memslot to region * -- Gitee From 68fd2b68fb1e601b05ae6595047d0cf5ab87ed39 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:09 -0700 Subject: [PATCH 06/25] KVM: selftests: Add helpers to convert guest memory b/w private and shared ANBZ: #34453 commit f7fa67495d118f734f98b406fd46888616b4a3c3 upstream. Add helpers to convert memory between private and shared via KVM's memory attributes, as well as helpers to free/allocate guest_memfd memory via fallocate(). Userspace, i.e. tests, is NOT required to do fallocate() when converting memory, as the attributes are the single source of truth. Provide allocate() helpers so that tests can mimic a userspace that frees private memory on conversion, e.g. to prioritize memory usage over performance. Signed-off-by: Vishal Annapurve Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-28-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 48 +++++++++++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++++++++++ 2 files changed, 76 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f77b76d36755..8719749b4536 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -334,6 +334,54 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); } +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. + */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6768ec15521c..15369e0ca340 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1169,6 +1169,34 @@ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) __vm_mem_region_delete(vm, region, true); } +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, + bool punch_hole) +{ + const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0); + struct userspace_mem_region *region; + uint64_t end = base + size; + uint64_t gpa, len; + off_t fd_offset; + int ret; + + for (gpa = base; gpa < end; gpa += len) { + uint64_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD, + "Private memory region not found for GPA 0x%lx", gpa); + + offset = gpa - region->region.guest_phys_addr; + fd_offset = region->region.guest_memfd_offset + offset; + len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); + + ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); + TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx\n", + punch_hole ? "punch hole" : "allocate", gpa, len, + region->region.guest_memfd, mode, fd_offset); + } +} + /* Returns the size of a vCPU's kvm_run structure. */ static int vcpu_mmap_sz(void) { -- Gitee From 05fc8d9e2482e12548156a0804935a9c3d4add49 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:10 -0700 Subject: [PATCH 07/25] KVM: selftests: Add helpers to do KVM_HC_MAP_GPA_RANGE hypercalls (x86) ANBZ: #34453 commit 01244fce2fa22176e46609c051f6fe15cf81e188 upstream. Add helpers for x86 guests to invoke the KVM_HC_MAP_GPA_RANGE hypercall, which KVM will forward to userspace and thus can be used by tests to coordinate private<=>shared conversions between host userspace code and guest code. Signed-off-by: Vishal Annapurve [sean: drop shared/private helpers (let tests specify flags)] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-29-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/x86_64/processor.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 709e739fd2bc..0d09cdaeb5dd 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "../kvm_util.h" @@ -1208,6 +1209,20 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1); void xen_hypercall(uint64_t nr, uint64_t a0, void *a1); +static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa, + uint64_t size, uint64_t flags) +{ + return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0); +} + +static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, + uint64_t flags) +{ + uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags); + + GUEST_ASSERT(!ret); +} + void __vm_xsave_require_permission(uint64_t xfeature, const char *name); #define vm_xsave_require_permission(xfeature) \ -- Gitee From 7c4fc03b2593a406bc4f445f1e2cfc2fdd5f6025 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:11 -0700 Subject: [PATCH 08/25] KVM: selftests: Introduce VM "shape" to allow tests to specify the VM type ANBZ: #34453 commit 672eaa351015d8165c41fff644ee7d2b369cea12 upstream. Add a "vm_shape" structure to encapsulate the selftests-defined "mode", along with the KVM-defined "type" for use when creating a new VM. "mode" tracks physical and virtual address properties, as well as the preferred backing memory type, while "type" corresponds to the VM type. Taking the VM type will allow adding tests for KVM_CREATE_GUEST_MEMFD without needing an entirely separate set of helpers. At this time, guest_memfd is effectively usable only by confidential VM types in the form of guest private memory, and it's expected that x86 will double down and require unique VM types for TDX and SNP guests. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-30-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/aarch64/page_fault_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- .../selftests/kvm/include/kvm_util_base.h | 54 +++++++++++++++---- .../selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 43 +++++++-------- tools/testing/selftests/kvm/lib/memstress.c | 3 +- .../kvm/x86_64/ucna_injection_test.c | 2 +- 7 files changed, 73 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 975d28be3cca..192fbf64466b 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -705,7 +705,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) print_test_banner(mode, p); - vm = ____vm_create(mode); + vm = ____vm_create(VM_SHAPE(mode)); setup_memslots(vm, p); kvm_vm_elf_load(vm, program_invocation_name); setup_ucall(vm); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index e96fababd3f0..4ca747c33846 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -686,7 +686,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = __vm_create(mode, 1, extra_mem_pages); + vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages); log_mode_create_vm_done(vm); *vcpu = vm_vcpu_add(vm, 0, guest_code); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8719749b4536..683f6262da25 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -189,6 +189,23 @@ enum vm_guest_mode { NUM_VM_MODES, }; +struct vm_shape { + enum vm_guest_mode mode; + unsigned int type; +}; + +#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + #if defined(__aarch64__) extern enum vm_guest_mode vm_mode_default; @@ -221,6 +238,8 @@ extern enum vm_guest_mode vm_mode_default; #endif +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) + #define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) #define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) @@ -785,21 +804,21 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to * calculate the amount of memory needed for per-vCPU data, e.g. stacks. */ -struct kvm_vm *____vm_create(enum vm_guest_mode mode); -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages); static inline struct kvm_vm *vm_create_barebones(void) { - return ____vm_create(VM_MODE_DEFAULT); + return ____vm_create(VM_SHAPE_DEFAULT); } static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { - return __vm_create(VM_MODE_DEFAULT, nr_runnable_vcpus, 0); + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); } -struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]); @@ -807,17 +826,27 @@ static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, void *guest_code, struct kvm_vcpu *vcpus[]) { - return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0, + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, guest_code, vcpus); } + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + /* * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages * additional pages of guest memory. Returns the VM and vCPU (via out param). */ -struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code); +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code) @@ -825,6 +854,13 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, return __vm_create_with_one_vcpu(vcpu, 0, guest_code); } +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} + struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); void kvm_pin_this_task_to_pcpu(uint32_t pcpu); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 69f26d80c821..e37dc9c21888 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -254,7 +254,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages, + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, guest_num_pages, guest_code, test_args.vcpus); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 15369e0ca340..9f4c115116b8 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -209,7 +209,7 @@ __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) (1ULL << (vm->va_bits - 1)) >> vm->page_shift); } -struct kvm_vm *____vm_create(enum vm_guest_mode mode) +struct kvm_vm *____vm_create(struct vm_shape shape) { struct kvm_vm *vm; @@ -221,13 +221,13 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) vm->regions.hva_tree = RB_ROOT; hash_init(vm->regions.slot_hash); - vm->mode = mode; - vm->type = 0; + vm->mode = shape.mode; + vm->type = shape.type; - vm->pa_bits = vm_guest_mode_params[mode].pa_bits; - vm->va_bits = vm_guest_mode_params[mode].va_bits; - vm->page_size = vm_guest_mode_params[mode].page_size; - vm->page_shift = vm_guest_mode_params[mode].page_shift; + vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; + vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; + vm->page_size = vm_guest_mode_params[vm->mode].page_size; + vm->page_shift = vm_guest_mode_params[vm->mode].page_shift; /* Setup mode specific traits. */ switch (vm->mode) { @@ -265,7 +265,7 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) /* * Ignore KVM support for 5-level paging (vm->va_bits == 57), * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this VM_MODE. + * isn't for this mode (48-bit virtual address space). */ TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, "Linear address width (%d bits) not supported", @@ -285,10 +285,11 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) vm->pgtable_levels = 5; break; default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); + TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); } #ifdef __aarch64__ + TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types"); if (vm->pa_bits != 40) vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); #endif @@ -347,19 +348,19 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, return vm_adjust_num_guest_pages(mode, nr_pages); } -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages) { - uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, + uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus, nr_extra_pages); struct userspace_mem_region *slot0; struct kvm_vm *vm; int i; - pr_debug("%s: mode='%s' pages='%ld'\n", __func__, - vm_guest_mode_string(mode), nr_pages); + pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, + vm_guest_mode_string(shape.mode), shape.type, nr_pages); - vm = ____vm_create(mode); + vm = ____vm_create(shape); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); for (i = 0; i < NR_MEM_REGIONS; i++) @@ -400,7 +401,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, * extra_mem_pages is only used to calculate the maximum page table size, * no real memory allocation for non-slot0 memory in this function. */ -struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]) { @@ -409,7 +410,7 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); - vm = __vm_create(mode, nr_vcpus, extra_mem_pages); + vm = __vm_create(shape, nr_vcpus, extra_mem_pages); for (i = 0; i < nr_vcpus; ++i) vcpus[i] = vm_vcpu_add(vm, i, guest_code); @@ -417,15 +418,15 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus return vm; } -struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code) +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) { struct kvm_vcpu *vcpus[1]; struct kvm_vm *vm; - vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, - guest_code, vcpus); + vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus); *vcpu = vcpus[0]; return vm; diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index df457452d146..d05487e5a371 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -168,7 +168,8 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, + slot0_pages + guest_num_pages, memstress_guest_code, vcpus); args->vm = vm; diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index 85f34ca7e49e..0ed32ec903d0 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -271,7 +271,7 @@ int main(int argc, char *argv[]) kvm_check_cap(KVM_CAP_MCE); - vm = __vm_create(VM_MODE_DEFAULT, 3, 0); + vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0); kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &supported_mcg_caps); -- Gitee From 5ae0c299f76083e4653ef26e78a510c29588f7d2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:12 -0700 Subject: [PATCH 09/25] KVM: selftests: Add GUEST_SYNC[1-6] macros for synchronizing more data ANBZ: #34453 commit 242331dfc4959f44e706c9b09b768f312aa5e117 upstream. Add GUEST_SYNC[1-6]() so that tests can pass the maximum amount of information supported via ucall(), without needing to resort to shared memory. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-31-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- tools/testing/selftests/kvm/include/ucall_common.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index ce33d306c2cb..0fb472a5a058 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -52,6 +52,17 @@ int ucall_nr_pages_required(uint64_t page_size); #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) +#define GUEST_SYNC1(arg0) ucall(UCALL_SYNC, 1, arg0) +#define GUEST_SYNC2(arg0, arg1) ucall(UCALL_SYNC, 2, arg0, arg1) +#define GUEST_SYNC3(arg0, arg1, arg2) \ + ucall(UCALL_SYNC, 3, arg0, arg1, arg2) +#define GUEST_SYNC4(arg0, arg1, arg2, arg3) \ + ucall(UCALL_SYNC, 4, arg0, arg1, arg2, arg3) +#define GUEST_SYNC5(arg0, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 5, arg0, arg1, arg2, arg3, arg4) +#define GUEST_SYNC6(arg0, arg1, arg2, arg3, arg4, arg5) \ + ucall(UCALL_SYNC, 6, arg0, arg1, arg2, arg3, arg4, arg5) + #define GUEST_PRINTF(_fmt, _args...) ucall_fmt(UCALL_PRINTF, _fmt, ##_args) #define GUEST_DONE() ucall(UCALL_DONE, 0) -- Gitee From 1346b97ea2e70ee99641c0c1a9242a2046788f3a Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:13 -0700 Subject: [PATCH 10/25] KVM: selftests: Add x86-only selftest for private memory conversions ANBZ: #34453 commit 43f623f350ce1c46c53b6b77f4dbe741af8c44f3 upstream. Add a selftest to exercise implicit/explicit conversion functionality within KVM and verify: - Shared memory is visible to host userspace - Private memory is not visible to host userspace - Host userspace and guest can communicate over shared memory - Data in shared backing is preserved across conversions (test's host userspace doesn't free the data) - Private memory is bound to the lifetime of the VM Ideally, KVM's selftests infrastructure would be reworked to allow backing a single region of guest memory with multiple memslots for _all_ backing types and shapes, i.e. ideally the code for using a single backing fd across multiple memslots would work for "regular" memory as well. But sadly, support for KVM_CREATE_GUEST_MEMFD has languished for far too long, and overhauling selftests' memslots infrastructure would likely open a can of worms, i.e. delay things even further. In addition to the more obvious tests, verify that PUNCH_HOLE actually frees memory. Directly verifying that KVM frees memory is impractical, if it's even possible, so instead indirectly verify memory is freed by asserting that the guest reads zeroes after a PUNCH_HOLE. E.g. if KVM zaps SPTEs but doesn't actually punch a hole in the inode, the subsequent read will still see the previous value. And obviously punching a hole shouldn't cause explosions. Let the user specify the number of memslots in the private mem conversion test, i.e. don't require the number of memslots to be '1' or "nr_vcpus". Creating more memslots than vCPUs is particularly interesting, e.g. it can result in a single KVM_SET_MEMORY_ATTRIBUTES spanning multiple memslots. To keep the math reasonable, align each vCPU's chunk to at least 2MiB (the size is 2MiB+4KiB), and require the total size to be cleanly divisible by the number of memslots. The goal is to be able to validate that KVM plays nice with multiple memslots, being able to create a truly arbitrary number of memslots doesn't add meaningful value, i.e. isn't worth the cost. Intentionally don't take a requirement on KVM_CAP_GUEST_MEMFD, KVM_CAP_MEMORY_FAULT_INFO, KVM_MEMORY_ATTRIBUTE_PRIVATE, etc., as it's a KVM bug to advertise KVM_X86_SW_PROTECTED_VM without its prerequisites. Signed-off-by: Vishal Annapurve Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-32-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/private_mem_conversions_test.c | 482 ++++++++++++++++++ 2 files changed, 483 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 197ba0aebf7a..3ffdcb39e051 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -91,6 +91,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c new file mode 100644 index 000000000000..4d6a37a5d896 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define BASE_DATA_SLOT 10 +#define BASE_DATA_GPA ((uint64_t)(1ull << 32)) +#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE)) + +/* Horrific macro so that the line info is captured accurately :-( */ +#define memcmp_g(gpa, pattern, size) \ +do { \ + uint8_t *mem = (uint8_t *)gpa; \ + size_t i; \ + \ + for (i = 0; i < size; i++) \ + __GUEST_ASSERT(mem[i] == pattern, \ + "Guest expected 0x%x at offset %lu (gpa 0x%llx), got 0x%x", \ + pattern, i, gpa + i, mem[i]); \ +} while (0) + +static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + TEST_ASSERT(mem[i] == pattern, + "Host expected 0x%x at gpa 0x%lx, got 0x%x", + pattern, gpa + i, mem[i]); +} + +/* + * Run memory conversion tests with explicit conversion: + * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit + * to back/unback private memory. Subsequent accesses by guest to the gpa range + * will not cause exit to userspace. + * + * Test memory conversion scenarios with following steps: + * 1) Access private memory using private access and verify that memory contents + * are not visible to userspace. + * 2) Convert memory to shared using explicit conversions and ensure that + * userspace is able to access the shared regions. + * 3) Convert memory back to private using explicit conversions and ensure that + * userspace is again not able to access converted private regions. + */ + +#define GUEST_STAGE(o, s) { .offset = o, .size = s } + +enum ucall_syncs { + SYNC_SHARED, + SYNC_PRIVATE, +}; + +static void guest_sync_shared(uint64_t gpa, uint64_t size, + uint8_t current_pattern, uint8_t new_pattern) +{ + GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern); +} + +static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern) +{ + GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern); +} + +/* Arbitrary values, KVM doesn't care about the attribute flags. */ +#define MAP_GPA_SET_ATTRIBUTES BIT(0) +#define MAP_GPA_SHARED BIT(1) +#define MAP_GPA_DO_FALLOCATE BIT(2) + +static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared, + bool do_fallocate) +{ + uint64_t flags = MAP_GPA_SET_ATTRIBUTES; + + if (map_shared) + flags |= MAP_GPA_SHARED; + if (do_fallocate) + flags |= MAP_GPA_DO_FALLOCATE; + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, true, do_fallocate); +} + +static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, false, do_fallocate); +} + +struct { + uint64_t offset; + uint64_t size; +} static const test_ranges[] = { + GUEST_STAGE(0, PAGE_SIZE), + GUEST_STAGE(0, SZ_2M), + GUEST_STAGE(PAGE_SIZE, PAGE_SIZE), + GUEST_STAGE(PAGE_SIZE, SZ_2M), + GUEST_STAGE(SZ_2M, PAGE_SIZE), +}; + +static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate) +{ + const uint8_t def_p = 0xaa; + const uint8_t init_p = 0xcc; + uint64_t j; + int i; + + /* Memory should be shared by default. */ + memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE); + guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p); + + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + uint8_t p1 = 0x11; + uint8_t p2 = 0x22; + uint8_t p3 = 0x33; + uint8_t p4 = 0x44; + + /* + * Set the test region to pattern one to differentiate it from + * the data range as a whole (contains the initial pattern). + */ + memset((void *)gpa, p1, size); + + /* + * Convert to private, set and verify the private data, and + * then verify that the rest of the data (map shared) still + * holds the initial pattern, and that the host always sees the + * shared memory (initial pattern). Unlike shared memory, + * punching a hole in private memory is destructive, i.e. + * previous values aren't guaranteed to be preserved. + */ + guest_map_private(gpa, size, do_fallocate); + + if (size > PAGE_SIZE) { + memset((void *)gpa, p2, PAGE_SIZE); + goto skip; + } + + memset((void *)gpa, p2, size); + guest_sync_private(gpa, size, p1); + + /* + * Verify that the private memory was set to pattern two, and + * that shared memory still holds the initial pattern. + */ + memcmp_g(gpa, p2, size); + if (gpa > base_gpa) + memcmp_g(base_gpa, init_p, gpa - base_gpa); + if (gpa + size < base_gpa + PER_CPU_DATA_SIZE) + memcmp_g(gpa + size, init_p, + (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size)); + + /* + * Convert odd-number page frames back to shared to verify KVM + * also correctly handles holes in private ranges. + */ + for (j = 0; j < size; j += PAGE_SIZE) { + if ((j >> PAGE_SHIFT) & 1) { + guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate); + guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3); + + memcmp_g(gpa + j, p3, PAGE_SIZE); + } else { + guest_sync_private(gpa + j, PAGE_SIZE, p1); + } + } + +skip: + /* + * Convert the entire region back to shared, explicitly write + * pattern three to fill in the even-number frames before + * asking the host to verify (and write pattern four). + */ + guest_map_shared(gpa, size, do_fallocate); + memset((void *)gpa, p3, size); + guest_sync_shared(gpa, size, p3, p4); + memcmp_g(gpa, p4, size); + + /* Reset the shared memory back to the initial pattern. */ + memset((void *)gpa, init_p, size); + + /* + * Free (via PUNCH_HOLE) *all* private memory so that the next + * iteration starts from a clean slate, e.g. with respect to + * whether or not there are pages/folios in guest_mem. + */ + guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true); + } +} + +static void guest_punch_hole(uint64_t gpa, uint64_t size) +{ + /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */ + uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE; + + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +/* + * Test that PUNCH_HOLE actually frees memory by punching holes without doing a + * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating + * (subsequent fault) should zero memory. + */ +static void guest_test_punch_hole(uint64_t base_gpa, bool precise) +{ + const uint8_t init_p = 0xcc; + int i; + + /* + * Convert the entire range to private, this testcase is all about + * punching holes in guest_memfd, i.e. shared mappings aren't needed. + */ + guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + + /* + * Free all memory before each iteration, even for the !precise + * case where the memory will be faulted back in. Freeing and + * reallocating should obviously work, and freeing all memory + * minimizes the probability of cross-testcase influence. + */ + guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE); + + /* Fault-in and initialize memory, and verify the pattern. */ + if (precise) { + memset((void *)gpa, init_p, size); + memcmp_g(gpa, init_p, size); + } else { + memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + } + + /* + * Punch a hole at the target range and verify that reads from + * the guest succeed and return zeroes. + */ + guest_punch_hole(gpa, size); + memcmp_g(gpa, 0, size); + } +} + +static void guest_code(uint64_t base_gpa) +{ + /* + * Run the conversion test twice, with and without doing fallocate() on + * the guest_memfd backing when converting between shared and private. + */ + guest_test_explicit_conversion(base_gpa, false); + guest_test_explicit_conversion(base_gpa, true); + + /* + * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd + * faulted in, once with only the target range faulted in. + */ + guest_test_punch_hole(base_gpa, false); + guest_test_punch_hole(base_gpa, true); + GUEST_DONE(); +} + +static void handle_exit_hypercall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + uint64_t gpa = run->hypercall.args[0]; + uint64_t size = run->hypercall.args[1] * PAGE_SIZE; + bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES; + bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED; + bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE; + struct kvm_vm *vm = vcpu->vm; + + TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE, + "Wanted MAP_GPA_RANGE (%u), got '%llu'", + KVM_HC_MAP_GPA_RANGE, run->hypercall.nr); + + if (do_fallocate) + vm_guest_mem_fallocate(vm, gpa, size, map_shared); + + if (set_attributes) + vm_set_memory_attributes(vm, gpa, size, + map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE); + run->hypercall.ret = 0; +} + +static bool run_vcpus; + +static void *__test_mem_conversions(void *__vcpu) +{ + struct kvm_vcpu *vcpu = __vcpu; + struct kvm_run *run = vcpu->run; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + + while (!READ_ONCE(run_vcpus)) + ; + + for ( ;; ) { + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_HYPERCALL) { + handle_exit_hypercall(vcpu); + continue; + } + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: { + uint64_t gpa = uc.args[1]; + size_t size = uc.args[2]; + size_t i; + + TEST_ASSERT(uc.args[0] == SYNC_SHARED || + uc.args[0] == SYNC_PRIVATE, + "Unknown sync command '%ld'", uc.args[0]); + + for (i = 0; i < size; i += vm->page_size) { + size_t nr_bytes = min_t(size_t, vm->page_size, size - i); + uint8_t *hva = addr_gpa2hva(vm, gpa + i); + + /* In all cases, the host should observe the shared data. */ + memcmp_h(hva, gpa + i, uc.args[3], nr_bytes); + + /* For shared, write the new pattern to guest memory. */ + if (uc.args[0] == SYNC_SHARED) + memset(hva, uc.args[4], nr_bytes); + } + break; + } + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +} + +static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, + uint32_t nr_memslots) +{ + /* + * Allocate enough memory so that each vCPU's chunk of memory can be + * naturally aligned with respect to the size of the backing store. + */ + const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type)); + const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment); + const size_t memfd_size = per_cpu_size * nr_vcpus; + const size_t slot_size = memfd_size / nr_memslots; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + pthread_t threads[KVM_MAX_VCPUS]; + struct kvm_vm *vm; + int memfd, i, r; + + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + TEST_ASSERT(slot_size * nr_memslots == memfd_size, + "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)", + memfd_size, nr_memslots); + vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus); + + vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); + + memfd = vm_create_guest_memfd(vm, memfd_size, 0); + + for (i = 0; i < nr_memslots; i++) + vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, + BASE_DATA_SLOT + i, slot_size / vm->page_size, + KVM_MEM_GUEST_MEMFD, memfd, slot_size * i); + + for (i = 0; i < nr_vcpus; i++) { + uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size; + + vcpu_args_set(vcpus[i], 1, gpa); + + /* + * Map only what is needed so that an out-of-bounds access + * results #PF => SHUTDOWN instead of data corruption. + */ + virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size); + + pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]); + } + + WRITE_ONCE(run_vcpus, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + kvm_vm_free(vm); + + /* + * Allocate and free memory from the guest_memfd after closing the VM + * fd. The guest_memfd is gifted a reference to its owning VM, i.e. + * should prevent the VM from being fully destroyed until the last + * reference to the guest_memfd is also put. + */ + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); +} + +static void usage(const char *cmd) +{ + puts(""); + printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd); + puts(""); + backing_src_help("-s"); + puts(""); + puts(" -n: specify the number of vcpus (default: 1)"); + puts(""); + puts(" -m: specify the number of memslots (default: 1)"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC; + uint32_t nr_memslots = 1; + uint32_t nr_vcpus = 1; + int opt; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) { + switch (opt) { + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'n': + nr_vcpus = atoi_positive("nr_vcpus", optarg); + break; + case 'm': + nr_memslots = atoi_positive("nr_memslots", optarg); + break; + case 'h': + default: + usage(argv[0]); + exit(0); + } + } + + test_mem_conversions(src_type, nr_vcpus, nr_memslots); + + return 0; +} -- Gitee From b06c12a20be578b01e8356f3dba29abc760ab7cd Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:14 -0700 Subject: [PATCH 11/25] KVM: selftests: Add KVM_SET_USER_MEMORY_REGION2 helper ANBZ: #34453 commit e6f4f345b259cad178bad7e40a9d4b65cf195067 upstream. Add helpers to invoke KVM_SET_USER_MEMORY_REGION2 directly so that tests can validate of features that are unique to "version 2" of "set user memory region", e.g. do negative testing on gmem_fd and gmem_offset. Provide a raw version as well as an assert-success version to reduce the amount of boilerplate code need for basic usage. Signed-off-by: Chao Peng Signed-off-by: Ackerley Tng Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-33-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 7 +++++ tools/testing/selftests/kvm/lib/kvm_util.c | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 683f6262da25..398c705e4e11 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -523,6 +523,13 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9f4c115116b8..8964027cc299 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -870,6 +870,35 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + struct kvm_userspace_memory_region2 region = { + .slot = slot, + .flags = flags, + .guest_phys_addr = gpa, + .memory_size = size, + .userspace_addr = (uintptr_t)hva, + .guest_memfd = guest_memfd, + .guest_memfd_offset = guest_memfd_offset, + }; + + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); +} + +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva, + guest_memfd, guest_memfd_offset); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)", + errno, strerror(errno)); +} + + /* FIXME: This thing needs to be ripped apart and rewritten. */ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, -- Gitee From 72b18df8c3bfb54960847968447e8d5c8c4537d2 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:15 -0700 Subject: [PATCH 12/25] KVM: selftests: Expand set_memory_region_test to validate guest_memfd() ANBZ: #34453 commit 2feabb855df8f0889146c2e951307ba477d1f37b upstream. Expand set_memory_region_test to exercise various positive and negative testcases for private memory. - Non-guest_memfd() file descriptor for private memory - guest_memfd() from different VM - Overlapping bindings - Unaligned bindings Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng [sean: trim the testcases to remove duplicate coverage] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-34-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 12 ++ .../selftests/kvm/set_memory_region_test.c | 104 +++++++++++++++++- 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 398c705e4e11..8c714071e503 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -820,6 +820,18 @@ static inline struct kvm_vm *vm_create_barebones(void) return ____vm_create(VM_SHAPE_DEFAULT); } +#ifdef __x86_64__ +static inline struct kvm_vm *vm_create_barebones_protected_vm(void) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + return ____vm_create(shape); +} +#endif + static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index b32960189f5f..a78394faee7c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -385,13 +385,105 @@ static void test_add_max_memory_regions(void) kvm_vm_free(vm); } + +#ifdef __x86_64__ +static void test_invalid_guest_memfd(struct kvm_vm *vm, int memfd, + size_t offset, const char *msg) +{ + int r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, + 0, memfd, offset); + TEST_ASSERT(r == -1 && errno == EINVAL, "%s", msg); +} + +static void test_add_private_memory_region(void) +{ + struct kvm_vm *vm, *vm2; + int memfd, i; + + pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_protected_vm(); + + test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); + test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); + + memfd = kvm_memfd_alloc(MEM_REGION_SIZE, false); + test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); + close(memfd); + + vm2 = vm_create_barebones_protected_vm(); + memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); + test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); + + vm_set_user_memory_region2(vm2, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + kvm_vm_free(vm2); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + for (i = 1; i < PAGE_SIZE; i++) + test_invalid_guest_memfd(vm, memfd, i, "Unaligned offset should fail"); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + + kvm_vm_free(vm); +} + +static void test_add_overlapping_private_memory_regions(void) +{ + struct kvm_vm *vm; + int memfd; + int r; + + pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_protected_vm(); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE * 2, 0, memfd, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT + 1, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2, MEM_REGION_SIZE * 2, + 0, memfd, MEM_REGION_SIZE * 2); + + /* + * Delete the first memslot, and then attempt to recreate it except + * with a "bad" offset that results in overlap in the guest_memfd(). + */ + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, 0, NULL, -1, 0); + + /* Overlap the front half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 - MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + /* And now the back half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 + MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + close(memfd); + kvm_vm_free(vm); +} +#endif + int main(int argc, char *argv[]) { #ifdef __x86_64__ int i, loops; -#endif -#ifdef __x86_64__ /* * FIXME: the zero-memslot test fails on aarch64 and s390x because * KVM_RUN fails with ENOEXEC or EFAULT. @@ -402,6 +494,14 @@ int main(int argc, char *argv[]) test_add_max_memory_regions(); #ifdef __x86_64__ + if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) && + (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { + test_add_private_memory_region(); + test_add_overlapping_private_memory_regions(); + } else { + pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); + } + if (argc > 1) loops = atoi_positive("Number of iterations", argv[1]); else -- Gitee From 9037164145fdcaef11eebfb6550f676c74971610 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:16 -0700 Subject: [PATCH 13/25] KVM: selftests: Add basic selftest for guest_memfd() ANBZ: #34453 commit 8a89efd43423cb3005c5e641e846184e292c1465 upstream. Add a selftest to verify the basic functionality of guest_memfd(): + file descriptor created with the guest_memfd() ioctl does not allow read/write/mmap operations + file size and block size as returned from fstat are as expected + fallocate on the fd checks that offset/length on fallocate(FALLOC_FL_PUNCH_HOLE) should be page aligned + invalid inputs (misaligned size, invalid flags) are rejected + file size and inode are unique (the innocuous-sounding anon_inode_getfile() backs all files with a single inode...) Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-35-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/guest_memfd_test.c | 207 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 tools/testing/selftests/kvm/guest_memfd_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 3ffdcb39e051..d00c0955c3aa 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -134,6 +134,7 @@ TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += guest_memfd_test TEST_GEN_PROGS_x86_64 += guest_print_test TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c new file mode 100644 index 000000000000..fd389663c49b --- /dev/null +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright Intel Corporation, 2023 + * + * Author: Chao Peng + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util_base.h" + +static void test_file_read_write(int fd) +{ + char buf[64]; + + TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, + "read on a guest_mem fd should fail"); + TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, + "write on a guest_mem fd should fail"); + TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, + "pread on a guest_mem fd should fail"); + TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, + "pwrite on a guest_mem fd should fail"); +} + +static void test_mmap(int fd, size_t page_size) +{ + char *mem; + + mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); +} + +static void test_file_size(int fd, size_t page_size, size_t total_size) +{ + struct stat sb; + int ret; + + ret = fstat(fd, &sb); + TEST_ASSERT(!ret, "fstat should succeed"); + TEST_ASSERT_EQ(sb.st_size, total_size); + TEST_ASSERT_EQ(sb.st_blksize, page_size); +} + +static void test_fallocate(int fd, size_t page_size, size_t total_size) +{ + int ret; + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, total_size); + TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size - 1, page_size); + TEST_ASSERT(ret, "fallocate with unaligned offset should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size, page_size); + TEST_ASSERT(ret, "fallocate beginning at total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size + page_size, page_size); + TEST_ASSERT(ret, "fallocate beginning after total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size + page_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) after total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size - 1); + TEST_ASSERT(ret, "fallocate with unaligned size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, page_size, page_size); + TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed"); +} + +static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) +{ + struct { + off_t offset; + off_t len; + } testcases[] = { + {0, 1}, + {0, page_size - 1}, + {0, page_size + 1}, + + {1, 1}, + {1, page_size - 1}, + {1, page_size}, + {1, page_size + 1}, + + {page_size, 1}, + {page_size, page_size - 1}, + {page_size, page_size + 1}, + }; + int ret, i; + + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + testcases[i].offset, testcases[i].len); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "PUNCH_HOLE with !PAGE_SIZE offset (%lx) and/or length (%lx) should fail", + testcases[i].offset, testcases[i].len); + } +} + +static void test_create_guest_memfd_invalid(struct kvm_vm *vm) +{ + size_t page_size = getpagesize(); + uint64_t flag; + size_t size; + int fd; + + for (size = 1; size < page_size; size++) { + fd = __vm_create_guest_memfd(vm, size, 0); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL", + size); + } + + for (flag = 1; flag; flag <<= 1) { + uint64_t bit; + + fd = __vm_create_guest_memfd(vm, page_size, flag); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with flag '0x%lx' should fail with EINVAL", + flag); + + for_each_set_bit(bit, &valid_flags, 64) { + fd = __vm_create_guest_memfd(vm, page_size, flag | BIT_ULL(bit)); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with flags '0x%llx' should fail with EINVAL", + flag | BIT_ULL(bit)); + } + } +} + +static void test_create_guest_memfd_multiple(struct kvm_vm *vm) +{ + int fd1, fd2, ret; + struct stat st1, st2; + + fd1 = __vm_create_guest_memfd(vm, 4096, 0); + TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size"); + + fd2 = __vm_create_guest_memfd(vm, 8192, 0); + TEST_ASSERT(fd2 != -1, "memfd creation should succeed"); + + ret = fstat(fd2, &st2); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); + TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); +} + +int main(int argc, char *argv[]) +{ + size_t page_size; + size_t total_size; + int fd; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + page_size = getpagesize(); + total_size = page_size * 4; + + vm = vm_create_barebones(); + + test_create_guest_memfd_invalid(vm); + test_create_guest_memfd_multiple(vm); + + fd = vm_create_guest_memfd(vm, total_size, 0); + + test_file_read_write(fd); + test_mmap(fd, page_size); + test_file_size(fd, page_size, total_size); + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + + close(fd); +} -- Gitee From d6440853115215134e2627384815f9ad5b637aaf Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Fri, 27 Oct 2023 11:22:17 -0700 Subject: [PATCH 14/25] KVM: selftests: Test KVM exit behavior for private memory/access ANBZ: #34453 commit e3577788de64139202d89224fe31613c0f02b790 upstream. "Testing private access when memslot gets deleted" tests the behavior of KVM when a private memslot gets deleted while the VM is using the private memslot. When KVM looks up the deleted (slot = NULL) memslot, KVM should exit to userspace with KVM_EXIT_MEMORY_FAULT. In the second test, upon a private access to non-private memslot, KVM should also exit to userspace with KVM_EXIT_MEMORY_FAULT. Intentionally don't take a requirement on KVM_CAP_GUEST_MEMFD, KVM_CAP_MEMORY_FAULT_INFO, KVM_MEMORY_ATTRIBUTE_PRIVATE, etc., as it's a KVM bug to advertise KVM_X86_SW_PROTECTED_VM without its prerequisites. Signed-off-by: Ackerley Tng [sean: call out the similarities with set_memory_region_test] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-36-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/private_mem_kvm_exits_test.c | 120 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d00c0955c3aa..27af0edc1824 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c new file mode 100644 index 000000000000..13e72fcec8dd --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Google LLC. + */ +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +/* Arbitrarily selected to avoid overlaps with anything else */ +#define EXITS_TEST_GVA 0xc0000000 +#define EXITS_TEST_GPA EXITS_TEST_GVA +#define EXITS_TEST_NPAGES 1 +#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE) +#define EXITS_TEST_SLOT 10 + +static uint64_t guest_repeatedly_read(void) +{ + volatile uint64_t value; + + while (true) + value = *((uint64_t *) EXITS_TEST_GVA); + + return value; +} + +static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu) +{ + int r; + + r = _vcpu_run(vcpu); + if (r) { + TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r)); + TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT); + } + return vcpu->run->exit_reason; +} + +const struct vm_shape protected_vm_shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, +}; + +static void test_private_access_memslot_deleted(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + pthread_t vm_thread; + void *thread_return; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, + KVM_MEM_GUEST_MEMFD); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + pthread_create(&vm_thread, NULL, + (void *(*)(void *))run_vcpu_get_exit_reason, + (void *)vcpu); + + vm_mem_region_delete(vm, EXITS_TEST_SLOT); + + pthread_join(vm_thread, &thread_return); + exit_reason = (uint32_t)(uint64_t)thread_return; + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +static void test_private_access_memslot_not_private(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + /* Add a non-private memslot (flags = 0) */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, 0); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + exit_reason = run_vcpu_get_exit_reason(vcpu); + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + test_private_access_memslot_deleted(); + test_private_access_memslot_not_private(); +} -- Gitee From 02513694f6c9f4737d344ed767cedf6898b14152 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Oct 2023 17:20:49 -0700 Subject: [PATCH 15/25] KVM: selftests: Add a memory region subtest to validate invalid flags ANBZ: #34453 commit 5d74316466f4aabdd2ee1e33b45e4933c9bc3ea1 upstream. Add a subtest to set_memory_region_test to verify that KVM rejects invalid flags and combinations with -EINVAL. KVM might or might not fail with EINVAL anyways, but we can at least try. Signed-off-by: Sean Christopherson Message-Id: <20231031002049.3915752-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../testing/selftests/kvm/guest_memfd_test.c | 9 +--- .../selftests/kvm/set_memory_region_test.c | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index fd389663c49b..318ba6ba8bd3 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -136,20 +136,13 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm) size); } - for (flag = 1; flag; flag <<= 1) { + for (flag = 0; flag; flag <<= 1) { uint64_t bit; fd = __vm_create_guest_memfd(vm, page_size, flag); TEST_ASSERT(fd == -1 && errno == EINVAL, "guest_memfd() with flag '0x%lx' should fail with EINVAL", flag); - - for_each_set_bit(bit, &valid_flags, 64) { - fd = __vm_create_guest_memfd(vm, page_size, flag | BIT_ULL(bit)); - TEST_ASSERT(fd == -1 && errno == EINVAL, - "guest_memfd() with flags '0x%llx' should fail with EINVAL", - flag | BIT_ULL(bit)); - } } } diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index a78394faee7c..1efee1cfcff0 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -326,6 +326,53 @@ static void test_zero_memory_regions(void) } #endif /* __x86_64__ */ +static void test_invalid_memory_region_flags(void) +{ + uint32_t supported_flags = KVM_MEM_LOG_DIRTY_PAGES; + const uint32_t v2_only_flags = KVM_MEM_GUEST_MEMFD; + struct kvm_vm *vm; + int r, i; + +#ifdef __x86_64__ + supported_flags |= KVM_MEM_READONLY; + + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + vm = vm_create_barebones_protected_vm(); + else +#endif + vm = vm_create_barebones(); + + if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE) + supported_flags |= KVM_MEM_GUEST_MEMFD; + + for (i = 0; i < 32; i++) { + if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) + continue; + + r = __vm_set_user_memory_region(vm, MEM_REGION_SLOT, BIT(i), + MEM_REGION_GPA, MEM_REGION_SIZE, NULL); + + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i)); + + if (supported_flags & BIT(i)) + continue; + + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, BIT(i), + MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i)); + } + + if (supported_flags & KVM_MEM_GUEST_MEMFD) { + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, + KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + } +} + /* * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any * tentative to add further slots should fail. @@ -491,6 +538,8 @@ int main(int argc, char *argv[]) test_zero_memory_regions(); #endif + test_invalid_memory_region_flags(); + test_add_max_memory_regions(); #ifdef __x86_64__ -- Gitee From 2cd1294c56307a457274030a178e69ce592c7a8e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 21:58:23 +0000 Subject: [PATCH 16/25] fs: Rename mapping private members ANBZ: #34453 commit 600f111ef51dc2cbdb330b09d09f1856efa64912 upstream. It is hard to find where mapping->private_lock, mapping->private_list and mapping->private_data are used, due to private_XXX being a relatively common name for variables and structure members in the kernel. To fit with other members of struct address_space, rename them all to have an i_ prefix. Tested with an allmodconfig build. [Backport Changes] 1. In include/linux/fs.h, the member private_list in struct address_space was renamed to i_private_list as per the upstream change. So, to maintain consistency with the updated structure, all existing references to private_list in virt/kvm/guest_memfd.c and fs/inode.c were also updated to i_private_list, thereby matching the changes introduced in upstream commit 09d1c6a80f2cf9. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org Acked-by: Darrick J. Wong Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner Signed-off-by: Hemanth Selam Signed-off-by: mohanasv --- fs/aio.c | 16 +++--- fs/btrfs/extent_io.c | 52 ++++++++++---------- fs/btrfs/subpage.c | 4 +- fs/buffer.c | 108 ++++++++++++++++++++--------------------- fs/ext4/inode.c | 4 +- fs/gfs2/glock.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/hugetlbfs/inode.c | 4 +- fs/inode.c | 10 ++-- fs/nfs/write.c | 12 ++--- fs/nilfs2/inode.c | 4 +- fs/ntfs/aops.c | 10 ++-- include/linux/fs.h | 12 ++--- mm/hugetlb.c | 2 +- mm/migrate.c | 6 +-- virt/kvm/guest_memfd.c | 6 +-- 16 files changed, 127 insertions(+), 127 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 4a9b5e4719ee..416e9b37e79a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; - inode->i_mapping->private_data = ctx; + inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", @@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx) /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; - spin_lock(&i_mapping->private_lock); - i_mapping->private_data = NULL; + spin_lock(&i_mapping->i_private_lock); + i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&i_mapping->private_lock); + spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } @@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, rc = 0; - /* mapping->private_lock here protects against the kioctx teardown. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; + /* mapping->i_private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->i_private_lock); + ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; @@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, out_unlock: mutex_unlock(&ctx->ring_lock); out: - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return rc; } #else diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 669f80fb8710..d3a779322aac 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -885,7 +885,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * will not race with any other ebs. */ if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) @@ -1780,16 +1780,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start += sectors_per_node; continue; } @@ -1803,7 +1803,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1855,9 +1855,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -1868,16 +1868,16 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) * crashing the machine for something we can survive anyway. */ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -3313,7 +3313,7 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (PagePrivate(page)) { subpage = (struct btrfs_subpage *)page->private; @@ -3336,14 +3336,14 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3367,7 +3367,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag detach_page_private(page); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3390,7 +3390,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ @@ -3768,7 +3768,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * allocate memory with i_private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3789,10 +3789,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, goto free_eb; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); @@ -3812,7 +3812,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * Thus needs no special handling in error path. */ btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; @@ -4830,12 +4830,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared page private, as if we have * released all ebs in the page, the page private should be cleared now. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } @@ -4851,9 +4851,9 @@ int try_release_extent_buffer(struct page *page) * We need to make sure nobody is changing page->private, as we rely on * page->private as the pointer to extent buffer. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 1; } @@ -4868,10 +4868,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index b98d42ca5564..4fe2f14f5de6 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -200,7 +200,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; atomic_inc(&subpage->eb_refs); @@ -215,7 +215,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; ASSERT(atomic_read(&subpage->eb_refs)); diff --git a/fs/buffer.c b/fs/buffer.c index 6b5a7a678361..41f117ecd0f7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync); * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. + * i_private_lock. * - * Hack idea: for the blockdev mapping, private_lock contention + * Hack idea: for the blockdev mapping, i_private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. + * succeeds, there is no need to take i_private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -204,7 +204,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->private_lock); + spin_lock(&bd_mapping->i_private_lock); head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << bd_inode->i_blkbits); } out_unlock: - spin_unlock(&bd_mapping->private_lock); + spin_unlock(&bd_mapping->i_private_lock); folio_put(folio); out: return ret; @@ -467,25 +467,25 @@ EXPORT_SYMBOL(mark_buffer_async_write); * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. + * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space + * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. + * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, + * mapping->i_private_list will always be protected by the backing blockdev's + * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. + * ->i_private_list must be from the same address_space: the blockdev's. * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) + * address_spaces which do not place buffers at ->i_private_list via these + * utility functions are free to use i_private_lock and i_private_list for + * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The @@ -508,7 +508,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); */ /* - * The buffer's backing address_space's private_lock must be held + * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { @@ -519,7 +519,7 @@ static void __remove_assoc_queue(struct buffer_head *bh) int inode_has_buffers(struct inode *inode) { - return !list_empty(&inode->i_data.private_list); + return !list_empty(&inode->i_data.i_private_list); } /* @@ -561,7 +561,7 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * - * Starts I/O against the buffers at mapping->private_list, and waits upon + * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). @@ -570,13 +570,13 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->private_data; + struct address_space *buffer_mapping = mapping->i_private_data; - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); + return fsync_buffers_list(&buffer_mapping->i_private_lock, + &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); @@ -673,17 +673,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); - if (!mapping->private_data) { - mapping->private_data = buffer_mapping; + if (!mapping->i_private_data) { + mapping->i_private_data = buffer_mapping; } else { - BUG_ON(mapping->private_data != buffer_mapping); + BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -706,7 +706,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * We use private_lock to lock against try_to_free_buffers while using the + * We use i_private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. * @@ -718,7 +718,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) struct buffer_head *head; bool newly_dirty; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -734,7 +734,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) */ folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); @@ -827,7 +827,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); @@ -851,7 +851,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. */ @@ -859,13 +859,13 @@ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); @@ -882,10 +882,10 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { @@ -894,7 +894,7 @@ int remove_inode_buffers(struct inode *inode) } __remove_assoc_queue(bh); } - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } return ret; } @@ -1074,10 +1074,10 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->private_lock); + spin_lock(&inode->i_mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, size); - spin_unlock(&inode->i_mapping->private_lock); + spin_unlock(&inode->i_mapping->i_private_lock); unlock: folio_unlock(folio); folio_put(folio); @@ -1169,7 +1169,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1247,10 +1247,10 @@ void __bforget(struct buffer_head *bh) if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } @@ -1639,7 +1639,7 @@ EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * block_dirty_folio() via private_lock. try_to_free_buffers + * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *folio_create_empty_buffers(struct folio *folio, @@ -1657,7 +1657,7 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio, } while (bh); tail->b_this_page = head; - spin_lock(&folio->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { @@ -1669,7 +1669,7 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio, } while (bh != head); } folio_attach_private(folio, head); - spin_unlock(&folio->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return head; } @@ -1723,7 +1723,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) if (!folio_buffers(folio)) continue; /* - * We use folio lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ @@ -2855,7 +2855,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's i_private_lock. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio @@ -2866,7 +2866,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with - * private_lock. + * i_private_lock. * * try_to_free_buffers() is non-blocking. */ @@ -2918,7 +2918,7 @@ bool try_to_free_buffers(struct folio *folio) goto out; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* @@ -2931,13 +2931,13 @@ bool try_to_free_buffers(struct folio *folio) * the folio's buffers clean. We discover that here and clean * the folio also. * - * private_lock must be held over this entire operation in order + * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) folio_cancel_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b90931aff6ec..379577f6951f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1320,7 +1320,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `iocb` can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(const struct kiocb *iocb, @@ -3360,7 +3360,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? */ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 60c27913ce3c..4d5a6ae2dc2c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1258,7 +1258,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 65da3ae6d48e..0a3489d4ea17 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b072110eeb30..0626e8813751 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -700,7 +700,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. */ - resv_map = (struct resv_map *)(&inode->i_data)->private_data; + resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); @@ -1014,7 +1014,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); - inode->i_mapping->private_data = resv_map; + inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: diff --git a/fs/inode.c b/fs/inode.c index 83b58eeecb6e..530e6c651586 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -211,7 +211,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, @@ -400,8 +400,8 @@ static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); + INIT_LIST_HEAD(&mapping->i_private_list); + spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } @@ -657,7 +657,7 @@ void clear_inode(struct inode *inode) */ xa_destroy(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); @@ -962,7 +962,7 @@ static enum lru_status inode_lru_cold_count(struct list_head *item, } if (inode->i_data.nrpages || - !list_empty(&inode->i_data.private_list)) { + !list_empty(&inode->i_data.i_private_list)) { if (unlikely(inode_age)) kidled_set_slab_age(inode, 0); goto out; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0148719cc6dc..3bd3b23203cc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -182,13 +182,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return req; } @@ -809,13 +809,13 @@ static void nfs_inode_add_request(struct nfs_page *req) * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); folio_set_private(folio); folio->private = req; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. @@ -837,13 +837,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(folio && !folio_test_swapcache(folio))) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); } nfs_page_group_unlock(req); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 9f60172f9cba..7a27e7df3634 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -214,7 +214,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, /* * The page may not be locked, eg if called from try_to_unmap_one() */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -230,7 +230,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fea577007f4f..4227c1cd0260 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1707,7 +1707,7 @@ const struct address_space_operations ntfs_mst_aops = { * * If the page does not have buffers, we create them and set them uptodate. * The page may not be locked which is why we need to handle the buffers under - * the mapping->private_lock. Once the buffers are marked dirty we no longer + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer * need the lock since try_to_free_buffers() does not free dirty buffers. */ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { @@ -1719,11 +1719,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; @@ -1747,7 +1747,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { break; set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e296544eb59..98e39f6f73ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -503,9 +503,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -527,10 +527,10 @@ struct address_space { const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; + spinlock_t i_private_lock; + struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; - void *private_data; + void * i_private_data; struct fast_reflink_work *fast_reflink_work; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6481232d8c7b..1fe541c27e7f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1142,7 +1142,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ - return (struct resv_map *)(&inode->i_data)->private_data; + return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) diff --git a/mm/migrate.c b/mm/migrate.c index de7a950db819..83e5ab599f99 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -799,7 +799,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, recheck_buffers: busy = false; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); bh = head; do { if (atomic_read(&bh->b_count)) { @@ -813,7 +813,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -840,7 +840,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, rc = MIGRATEPAGE_SUCCESS; unlock_buffers: if (check_refs) - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head; do { unlock_buffer(bh); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index eccf0e901131..9467e1f6c851 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -97,7 +97,7 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) { - struct list_head *gmem_list = &inode->i_mapping->private_list; + struct list_head *gmem_list = &inode->i_mapping->i_private_list; pgoff_t start = offset >> PAGE_SHIFT; pgoff_t end = (offset + len) >> PAGE_SHIFT; struct kvm_gmem *gmem; @@ -273,7 +273,7 @@ static int kvm_gmem_migrate_folio(struct address_space *mapping, static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) { - struct list_head *gmem_list = &mapping->private_list; + struct list_head *gmem_list = &mapping->i_private_list; struct kvm_gmem *gmem; pgoff_t start, end; @@ -373,7 +373,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) kvm_get_kvm(kvm); gmem->kvm = kvm; xa_init(&gmem->bindings); - list_add(&gmem->entry, &inode->i_mapping->private_list); + list_add(&gmem->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; -- Gitee From c5204b00a609864d3233fbb4f631cc623e944977 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Nov 2023 11:24:08 -0500 Subject: [PATCH 17/25] selftests/kvm: fix compilation on non-x86_64 platforms ANBZ: #34453 commit e9e60c82fe391d04db55a91c733df4a017c28b2f upstream. MEM_REGION_SLOT and MEM_REGION_GPA are not really needed in test_invalid_memory_region_flags; the VM never runs and there are no other slots, so it is okay to use slot 0 and place it at address zero. This fixes compilation on architectures that do not define them. Fixes: 5d74316466f4 ("KVM: selftests: Add a memory region subtest to validate invalid flags") Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- tools/testing/selftests/kvm/guest_memfd_test.c | 2 -- tools/testing/selftests/kvm/set_memory_region_test.c | 12 ++++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 318ba6ba8bd3..c78a98c1a915 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -137,8 +137,6 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm) } for (flag = 0; flag; flag <<= 1) { - uint64_t bit; - fd = __vm_create_guest_memfd(vm, page_size, flag); TEST_ASSERT(fd == -1 && errno == EINVAL, "guest_memfd() with flag '0x%lx' should fail with EINVAL", diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 1efee1cfcff0..6637a0845acf 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -349,8 +349,8 @@ static void test_invalid_memory_region_flags(void) if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) continue; - r = __vm_set_user_memory_region(vm, MEM_REGION_SLOT, BIT(i), - MEM_REGION_GPA, MEM_REGION_SIZE, NULL); + r = __vm_set_user_memory_region(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i)); @@ -358,16 +358,16 @@ static void test_invalid_memory_region_flags(void) if (supported_flags & BIT(i)) continue; - r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, BIT(i), - MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + r = __vm_set_user_memory_region2(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL, 0, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i)); } if (supported_flags & KVM_MEM_GUEST_MEMFD) { - r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, + r = __vm_set_user_memory_region2(vm, 0, KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, - MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + 0, MEM_REGION_SIZE, NULL, 0, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); } -- Gitee From 38994e4e2ee4eaa8d5d19d56ab6ab38d1d74cb5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Nov 2023 17:09:52 -0800 Subject: [PATCH 18/25] KVM: selftests: Drop the single-underscore ioctl() helpers ANBZ: #34453 commit 6542a00369284c951185be8fa2ed45cf423cce06 upstream. Drop _kvm_ioctl(), _vm_ioctl(), and _vcpu_ioctl(), as they are no longer used by anything other than the no-underscores variants (and may have never been used directly). The single-underscore variants were never intended to be a "feature", they were a stopgap of sorts to ease the conversion to pretty printing ioctl() names when reporting errors. Opportunistically add a comment explaining when to use __KVM_IOCTL_ERROR() versus KVM_IOCTL_ERROR(). The single-underscore macros were subtly ensuring that the name of the ioctl() was printed on error, i.e. it's all too easy to overlook the fact that using __KVM_IOCTL_ERROR() is intentional. Link: https://lore.kernel.org/r/20231108010953.560824-2-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8c714071e503..c692cca3774f 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -268,6 +268,13 @@ static inline bool kvm_has_cap(long cap) #define __KVM_SYSCALL_ERROR(_name, _ret) \ "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) +/* + * Use the "inner", double-underscore macro when reporting errors from within + * other macros so that the name of ioctl() and not its literal numeric value + * is printed on error. The "outer" macro is strongly preferred when reporting + * errors "directly", i.e. without an additional layer of macros, as it reduces + * the probability of passing in the wrong string. + */ #define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) #define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) @@ -280,17 +287,13 @@ static inline bool kvm_has_cap(long cap) #define __kvm_ioctl(kvm_fd, cmd, arg) \ kvm_do_ioctl(kvm_fd, cmd, arg) - -#define _kvm_ioctl(kvm_fd, cmd, name, arg) \ +#define kvm_ioctl(kvm_fd, cmd, arg) \ ({ \ int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ }) -#define kvm_ioctl(kvm_fd, cmd, arg) \ - _kvm_ioctl(kvm_fd, cmd, #cmd, arg) - static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } #define __vm_ioctl(vm, cmd, arg) \ @@ -299,17 +302,13 @@ static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } kvm_do_ioctl((vm)->fd, cmd, arg); \ }) -#define _vm_ioctl(vm, cmd, name, arg) \ +#define vm_ioctl(vm, cmd, arg) \ ({ \ int ret = __vm_ioctl(vm, cmd, arg); \ \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ }) -#define vm_ioctl(vm, cmd, arg) \ - _vm_ioctl(vm, cmd, #cmd, arg) - - static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } #define __vcpu_ioctl(vcpu, cmd, arg) \ @@ -318,16 +317,13 @@ static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } kvm_do_ioctl((vcpu)->fd, cmd, arg); \ }) -#define _vcpu_ioctl(vcpu, cmd, name, arg) \ +#define vcpu_ioctl(vcpu, cmd, arg) \ ({ \ int ret = __vcpu_ioctl(vcpu, cmd, arg); \ \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ }) -#define vcpu_ioctl(vcpu, cmd, arg) \ - _vcpu_ioctl(vcpu, cmd, #cmd, arg) - /* * Looks up and returns the value corresponding to the capability * (KVM_CAP_*) given by cap. -- Gitee From 31cdcdcfac948496cb28922ada4298894b3a1746 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Nov 2023 17:09:53 -0800 Subject: [PATCH 19/25] KVM: selftests: Add logic to detect if ioctl() failed because VM was killed ANBZ: #34453 commit 1b78d474ce4ecbf15a4a8b08994b8ed8ceec0dab upstream. Add yet another macro to the VM/vCPU ioctl() framework to detect when an ioctl() failed because KVM killed/bugged the VM, i.e. when there was nothing wrong with the ioctl() itself. If KVM kills a VM, e.g. by way of a failed KVM_BUG_ON(), all subsequent VM and vCPU ioctl()s will fail with -EIO, which can be quite misleading and ultimately waste user/developer time. Use KVM_CHECK_EXTENSION on KVM_CAP_USER_MEMORY to detect if the VM is dead and/or bug, as KVM doesn't provide a dedicated ioctl(). Using a heuristic is obviously less than ideal, but practically speaking the logic is bulletproof barring a KVM change, and any such change would arguably break userspace, e.g. if KVM returns something other than -EIO. Without the detection, tearing down a bugged VM yields a cryptic failure when deleting memslots: ==== Test Assertion Failure ==== lib/kvm_util.c:689: !ret pid=45131 tid=45131 errno=5 - Input/output error 1 0x00000000004036c3: __vm_mem_region_delete at kvm_util.c:689 2 0x00000000004042f0: kvm_vm_free at kvm_util.c:724 (discriminator 12) 3 0x0000000000402929: race_sync_regs at sync_regs_test.c:193 4 0x0000000000401cab: main at sync_regs_test.c:334 (discriminator 6) 5 0x0000000000416f13: __libc_start_call_main at libc-start.o:? 6 0x000000000041855f: __libc_start_main_impl at ??:? 7 0x0000000000401d40: _start at ??:? KVM_SET_USER_MEMORY_REGION failed, rc: -1 errno: 5 (Input/output error) Which morphs into a more pointed error message with the detection: ==== Test Assertion Failure ==== lib/kvm_util.c:689: false pid=80347 tid=80347 errno=5 - Input/output error 1 0x00000000004039ab: __vm_mem_region_delete at kvm_util.c:689 (discriminator 5) 2 0x0000000000404660: kvm_vm_free at kvm_util.c:724 (discriminator 12) 3 0x0000000000402ac9: race_sync_regs at sync_regs_test.c:193 4 0x0000000000401cb7: main at sync_regs_test.c:334 (discriminator 6) 5 0x0000000000418263: __libc_start_call_main at libc-start.o:? 6 0x00000000004198af: __libc_start_main_impl at ??:? 7 0x0000000000401d90: _start at ??:? KVM killed/bugged the VM, check the kernel log for clues Suggested-by: Michal Luczaj Cc: Oliver Upton Cc: Colton Lewis Link: https://lore.kernel.org/r/20231108010953.560824-3-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../selftests/kvm/include/kvm_util_base.h | 39 ++++++++++++++++--- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c692cca3774f..e1335a6b1588 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -302,11 +302,40 @@ static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } kvm_do_ioctl((vm)->fd, cmd, arg); \ }) +/* + * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if + * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, + * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before + * selftests existed and (b) should never outright fail, i.e. is supposed to + * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the + * VM and its vCPUs, including KVM_CHECK_EXTENSION. + */ +#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ +do { \ + int __errno = errno; \ + \ + static_assert_is_vm(vm); \ + \ + if (cond) \ + break; \ + \ + if (errno == EIO && \ + __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ + TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ + TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ + } \ + errno = __errno; \ + TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ +} while (0) + +#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ + __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) + #define vm_ioctl(vm, cmd, arg) \ ({ \ int ret = __vm_ioctl(vm, cmd, arg); \ \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ }) static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } @@ -321,7 +350,7 @@ static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } ({ \ int ret = __vcpu_ioctl(vcpu, cmd, arg); \ \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ }) /* @@ -332,7 +361,7 @@ static inline int vm_check_cap(struct kvm_vm *vm, long cap) { int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); - TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); + TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); return ret; } @@ -439,7 +468,7 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm) { int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd)); + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); return fd; } @@ -681,7 +710,7 @@ static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) { int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd)); + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); return fd; } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8964027cc299..f9deddaccacf 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1273,7 +1273,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu->vm = vm; vcpu->id = vcpu_id; vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); - TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); + TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm); TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", -- Gitee From b61256610337b23af658b3ce5e93844e558ea279 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 7 Oct 2023 14:40:19 +0800 Subject: [PATCH 20/25] KVM: x86: Remove 'return void' expression for 'void function' ANBZ: #34453 commit ef8d89033c3f1f6a64757f066b2c17e76d1189f8 upstream. The requested info will be stored in 'guest_xsave->region' referenced by the incoming pointer "struct kvm_xsave *guest_xsave", thus there is no need to explicitly use return void expression for a void function "static void kvm_vcpu_ioctl_x86_get_xsave(...)". The issue is caught with [-Wpedantic]. Fixes: 2d287ec65e79 ("x86/fpu: Allow caller to constrain xfeatures when copying to uabi buffer") Signed-off-by: Like Xu Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231007064019.17472-1-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9dfd05078fcb..4491e29de445 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5456,8 +5456,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, - sizeof(guest_xsave->region)); + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, -- Gitee From 39378d130f208e157b0c9a133d7bc030f0f6a36e Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 22 Nov 2023 15:35:54 -0800 Subject: [PATCH 21/25] Support rv32 ULEB128 test ANBZ: #34453 commit 82180b1fae2432ee88b4a54cc6c376ba01e57b22 upstream. Use opcodes available to both rv32 and rv64 in uleb128 module linking test. Fixes: af71bc194916 ("riscv: Add tests for riscv module loading") Signed-off-by: Charlie Jenkins Reported-by: Randy Dunlap Closes: https://lore.kernel.org/lkml/1d7c71ee-5742-4df4-b8ef-a2aea0a624eb@infradead.org/ Tested-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20231122-module_fixup-v2-1-dfb9565e9ea5@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/riscv/kernel/tests/module_test/test_uleb128.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/tests/module_test/test_uleb128.S b/arch/riscv/kernel/tests/module_test/test_uleb128.S index 90f22049d553..8515ed7cd8c1 100644 --- a/arch/riscv/kernel/tests/module_test/test_uleb128.S +++ b/arch/riscv/kernel/tests/module_test/test_uleb128.S @@ -6,13 +6,13 @@ .text .global test_uleb_basic test_uleb_basic: - ld a0, second + lw a0, second addi a0, a0, -127 ret .global test_uleb_large test_uleb_large: - ld a0, fourth + lw a0, fourth addi a0, a0, -0x07e8 ret @@ -22,10 +22,10 @@ first: second: .reloc second, R_RISCV_SET_ULEB128, second .reloc second, R_RISCV_SUB_ULEB128, first - .dword 0 + .word 0 third: .space 1000 fourth: .reloc fourth, R_RISCV_SET_ULEB128, fourth .reloc fourth, R_RISCV_SUB_ULEB128, third - .dword 0 + .word 0 -- Gitee From deb7de88b8fd80621be9804e8110e2d5bea4fcb7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:15 +0100 Subject: [PATCH 22/25] KVM: x86/xen: Remove unneeded xen context from kvm_arch when !CONFIG_KVM_XEN ANBZ: #34453 commit 87562052c965ba7de6dc490434e53691fe46c898 upstream. Saving a few bytes of memory per KVM VM is certainly great but what's more important is the ability to see where the code accesses Xen emulation context while CONFIG_KVM_XEN is not enabled. Currently, kvm_cpu_get_extint() is the only such place and it is harmless: kvm_xen_has_interrupt() always returns '0' when !CONFIG_KVM_XEN. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/irq.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ce369b21fa68..e353aa21031b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1131,6 +1131,7 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; @@ -1142,6 +1143,7 @@ struct kvm_xen { struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1344,7 +1346,10 @@ struct kvm_arch { struct hlist_head mask_notifier_list; struct kvm_hv hyperv; + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b2c397dd2bc6..ad9ca8a60144 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -118,8 +118,10 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; +#ifdef CONFIG_KVM_XEN if (kvm_xen_has_interrupt(v)) return v->kvm->arch.xen.upcall_vector; +#endif if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; -- Gitee From 6636d2fca1510859dff28cab57d7a132d16f68f8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:20 +0100 Subject: [PATCH 23/25] KVM: VMX: Split off hyperv_evmcs.{ch} ANBZ: #34453 commit e7ad84db4d718e18c7a133e941ba4c7d4c6d4cbf upstream. Some Enlightened VMCS related code is needed both by Hyper-V on KVM and KVM on Hyper-V. As a preparation to making Hyper-V emulation optional, create dedicated 'hyperv_evmcs.{ch}' files which are used by both. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-7-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/hyperv.c | 308 ------------------------------- arch/x86/kvm/vmx/hyperv.h | 163 +---------------- arch/x86/kvm/vmx/hyperv_evmcs.c | 315 ++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/hyperv_evmcs.h | 166 +++++++++++++++++ arch/x86/kvm/vmx/nested.h | 1 + arch/x86/kvm/vmx/vmx_onhyperv.h | 3 +- 7 files changed, 486 insertions(+), 472 deletions(-) create mode 100644 arch/x86/kvm/vmx/hyperv_evmcs.c create mode 100644 arch/x86/kvm/vmx/hyperv_evmcs.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 39bbce9b1685..cabb2bc8129b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -23,7 +23,7 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/hyperv.o vmx/nested.o vmx/posted_intr.o + vmx/hyperv.o vmx/hyperv_evmcs.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o ifdef CONFIG_HYPERV diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index de13dc14fe1d..fab6a1ad98dc 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,314 +13,6 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} - -const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - /* - * Not used by KVM: - * - * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x0000682A, guest_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1A, host_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - */ - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; -const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); - u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 9401dbfaea7c..d4ed99008518 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -2,170 +2,9 @@ #ifndef __KVM_X86_VMX_HYPERV_H #define __KVM_X86_VMX_HYPERV_H -#include - -#include -#include -#include - -#include "../hyperv.h" - -#include "capabilities.h" -#include "vmcs.h" +#include #include "vmcs12.h" -#define KVM_EVMCS_VERSION 1 - -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_ENABLE_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - -struct evmcs_field { - u16 offset; - u16 clean_field; -}; - -extern const struct evmcs_field vmcs_field_to_evmcs_1[]; -extern const unsigned int nr_evmcs_1_fields; - -static __always_inline int evmcs_field_offset(unsigned long field, - u16 *clean_field) -{ - unsigned int index = ROL16(field, 6); - const struct evmcs_field *evmcs_field; - - if (unlikely(index >= nr_evmcs_1_fields)) - return -ENOENT; - - evmcs_field = &vmcs_field_to_evmcs_1[index]; - - /* - * Use offset=0 to detect holes in eVMCS. This offset belongs to - * 'revision_id' but this field has no encoding and is supposed to - * be accessed directly. - */ - if (unlikely(!evmcs_field->offset)) - return -ENOENT; - - if (clean_field) - *clean_field = evmcs_field->clean_field; - - return evmcs_field->offset; -} - -static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, - unsigned long field, u16 offset) -{ - /* - * vmcs12_read_any() doesn't care whether the supplied structure - * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes - * the exact offset of the required field, use it for convenience - * here. - */ - return vmcs12_read_any((void *)evmcs, field, offset); -} - #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c new file mode 100644 index 000000000000..904bfcd1519b --- /dev/null +++ b/arch/x86/kvm/vmx/hyperv_evmcs.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common code for working with Enlightened VMCS which is + * used both by Hyper-V on KVM and KVM on Hyper-V. + */ + +#include "hyperv_evmcs.h" + +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + /* + * Not used by KVM: + * + * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x0000682A, guest_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1A, host_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + */ + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; +const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h new file mode 100644 index 000000000000..a543fccfc574 --- /dev/null +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file contains common definitions for working with Enlightened VMCS which + * are used both by Hyper-V on KVM and KVM on Hyper-V. + */ +#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H +#define __KVM_X86_VMX_HYPERV_EVMCS_H + +#include + +#include "capabilities.h" +#include "vmcs12.h" + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +extern const struct evmcs_field vmcs_field_to_evmcs_1[]; +extern const unsigned int nr_evmcs_1_fields; + +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) +{ + const struct evmcs_field *evmcs_field; + unsigned int index = ROL16(field, 6); + + if (unlikely(index >= nr_evmcs_1_fields)) + return -ENOENT; + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b4b9d51438c6..b0f2e26c1aea 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -3,6 +3,7 @@ #define __KVM_X86_VMX_NESTED_H #include "kvm_cache_regs.h" +#include "hyperv.h" #include "vmcs12.h" #include "vmx.h" diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index 11541d272dbd..eb48153bfd73 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -4,11 +4,12 @@ #define __ARCH_X86_KVM_VMX_ONHYPERV_H__ #include +#include #include #include "capabilities.h" -#include "hyperv.h" +#include "hyperv_evmcs.h" #include "vmcs12.h" #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) -- Gitee From 6273ce7ef74f924172b2aa4530a887877c4b5bf4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:22 +0100 Subject: [PATCH 24/25] KVM: nVMX: Split off helper for emulating VMCLEAR on Hyper-V eVMCS ANBZ: #34453 commit b2e02f82b7f76234305c5a7fba4dbebc47ce4cb5 upstream. To avoid overloading handle_vmclear() with Hyper-V specific details and to prepare the code to making Hyper-V emulation optional, create a dedicated nested_evmcs_handle_vmclear() helper. No functional change intended. Suggested-by: Sean Christopherson Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231205103630.1391318-9-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/kvm/vmx/nested.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 684ba76379d7..6b6a42554aca 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -244,6 +244,29 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) } } +static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (!guest_cpuid_has_evmcs(vcpu) || + !evmptr_is_valid(nested_get_evmptr(vcpu))) + return false; + + if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + + return true; +} + static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, struct loaded_vmcs *prev) { @@ -5356,18 +5379,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - /* - * When Enlightened VMEntry is enabled on the calling CPU we treat - * memory area pointer by vmptr as Enlightened VMCS (as there's no good - * way to distinguish it from VMCS12) and we must not corrupt it by - * writing to the non-existent 'launch_state' field. The area doesn't - * have to be the currently active EVMCS on the calling CPU and there's - * nothing KVM has to do to transition it from 'active' to 'non-active' - * state. It is possible that the area will stay mapped as - * vmx->nested.hv_evmcs but this shouldn't be a problem. - */ - if (likely(!guest_cpuid_has_evmcs(vcpu) || - !evmptr_is_valid(nested_get_evmptr(vcpu)))) { + if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -5384,8 +5396,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { - nested_release_evmcs(vcpu); } return nested_vmx_succeed(vcpu); -- Gitee From 117a353ccdd4420652924b06070d331268e0ce1a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:25 +0100 Subject: [PATCH 25/25] KVM: nVMX: Move guest_cpuid_has_evmcs() to hyperv.h ANBZ: #34453 commit f97314626734deaef49564a429c6f8eee3846bd3 upstream. In preparation for making Hyper-V emulation optional, move Hyper-V specific guest_cpuid_has_evmcs() to hyperv.h. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Link: https://lore.kernel.org/r/20231205103630.1391318-12-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/kvm/vmx/hyperv.h | 11 +++++++++++ arch/x86/kvm/vmx/vmx.h | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index d4ed99008518..6e1ee951e360 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -4,6 +4,7 @@ #include #include "vmcs12.h" +#include "vmx.h" #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) @@ -20,6 +21,16 @@ enum nested_evmptrld_status { EVMPTRLD_ERROR, }; +static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +{ + /* + * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and + * eVMCS has been explicitly enabled by userspace. + */ + return vcpu->arch.hyperv_enabled && + to_vmx(vcpu)->nested.enlightened_vmcs_enabled; +} + u64 nested_get_evmptr(struct kvm_vcpu *vcpu); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e65d98493e35..6e77dcd549ce 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -790,14 +790,4 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) -{ - /* - * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and - * eVMCS has been explicitly enabled by userspace. - */ - return vcpu->arch.hyperv_enabled && - to_vmx(vcpu)->nested.enlightened_vmcs_enabled; -} - #endif /* __KVM_X86_VMX_H */ -- Gitee