From 4c7d35a20e6924fb20e175beed1be9e32297a524 Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Tue, 31 Jan 2023 17:33:28 +0800 Subject: [PATCH 01/12] anolis: mm: introduce vm_insert_page(s)_mkspecial ANBZ: #34745 cherry picked from devel-6.6 commit 30d55c99011614915377353a4672043ae1b8b31e. This adds the ability to insert anonymous pages or file pages, used for direct IO or buffer IO respectively, to a user VM. The intention behind this is to facilitate mapping pages in IO requests to user space, which is usually the backend of remote block device. This integrates the advantage of vm_insert_pages (batching the pmd lock), and eliminates the overhead of remap_pfn_range (track_pfn_remap), since the pages to be inserted should always be ram. NOTE that it is the caller's responsibility to ensure the validity of pages to be inserted, i.e., that such pages are used for IO requests. Depending on this premise, such pages can be inserted as special PTE, without increasing the page refcount and mapcount. On the other hand, the special mapping should be carefully managed (e.g., zapped) when the IO request is done. Signed-off-by: Xu Yu Signed-off-by: Guixin Liu Signed-off-by: Joseph Qi --- include/linux/mm.h | 4 ++ mm/memory.c | 170 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 95ad2c61151c..b78c7b4951f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4512,6 +4512,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page); +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); int map_kernel_pages_prepare(struct vm_area_desc *desc); int map_kernel_pages_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/mm/memory.c b/mm/memory.c index 821a585fccc0..edb51fdc5ec4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3291,6 +3291,176 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL +static int insert_page_into_pte_locked_mkspecial(struct mm_struct *mm, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + /* + * The page to be inserted should be either anonymous page or file page. + * + * In general, the anonymous page used in dio should be pinned, while + * the file page used in buffer IO is either locked (read) or writeback + * (sync). On the other hand, file page used in IO metadata read (e.g., + * ext4_get_inode_loc) can be unlocked, and the buffer_head is locked + * instead. + * + * Finally, it is the caller's responsibility to ensure the validity of + * pages to be inserted, i.e., such pages are used for IO requests. 
+ */ + if (!PageAnon(page) && !folio_is_file_lru(page_folio(page))) + return -EINVAL; + + flush_dcache_page(page); + + if (!pte_none(*pte)) + return -EBUSY; + set_pte_at(mm, addr, pte, pte_mkspecial(mk_pte(page, prot))); + return 0; +} + +static int insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = insert_page_into_pte_locked_mkspecial(mm, pte, addr, page, prot); + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(mmap_read_trylock(vma->vm_mm)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vm_flags_set(vma, VM_MIXEDMAP); + } + return insert_page_mkspecial(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page_mkspecial); + +#ifdef pte_index +/* + * insert_pages_mkspecial() amortizes the cost of spinlock operations + * when inserting pages in a loop. Arch *must* define pte_index. + */ +static int insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num, pgprot_t prot) +{ + pmd_t *pmd = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; + struct mm_struct *const mm = vma->vm_mm; + unsigned long curr_page_idx = 0; + unsigned long remaining_pages_total = *num; + unsigned long pages_to_write_in_pmd; + int ret; +more: + ret = -EFAULT; + pmd = walk_to_pmd(mm, addr); + if (!pmd) + goto out; + + pages_to_write_in_pmd = min_t(unsigned long, + remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); + + /* Allocate the PTE if necessary; takes PMD lock once only. */ + ret = -ENOMEM; + if (pte_alloc(mm, pmd)) + goto out; + + while (pages_to_write_in_pmd) { + int pte_idx = 0; + const int batch_size = min_t(int, pages_to_write_in_pmd, 8); + + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_into_pte_locked_mkspecial(mm, pte, + addr, pages[curr_page_idx], prot); + if (unlikely(err)) { + pte_unmap_unlock(start_pte, pte_lock); + ret = err; + remaining_pages_total -= pte_idx; + goto out; + } + addr += PAGE_SIZE; + ++curr_page_idx; + } + pte_unmap_unlock(start_pte, pte_lock); + pages_to_write_in_pmd -= batch_size; + remaining_pages_total -= batch_size; + } + if (remaining_pages_total) + goto more; + ret = 0; +out: + *num = remaining_pages_total; + return ret; +} +#endif /* pte_index */ + +/* + * vm_insert_pages_mkspecial - variant of vm_insert_pages using insert_pfn. + * + * The main purpose of vm_insert_pages_mkspecial is to combine the advantages of + * vm_insert_pages (batching the pmd lock) and remap_pfn_range_notrack (skipping + * track_pfn_insert). + * + * The caller should ensure the isolation (refcounted, PG_locked, PG_writeback, etc.) + * of @pages, and account for error case where a subset of @pages are mapped. 
+ */
+int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr,
+		struct page **pages, unsigned long *num)
+{
+#ifdef pte_index
+	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+	if (addr < vma->vm_start || end_addr >= vma->vm_end)
+		return -EFAULT;
+	if (!(vma->vm_flags & VM_MIXEDMAP)) {
+		BUG_ON(mmap_read_trylock(vma->vm_mm));
+		BUG_ON(vma->vm_flags & VM_PFNMAP);
+		vm_flags_set(vma, VM_MIXEDMAP);
+	}
+	return insert_pages_mkspecial(vma, addr, pages, num, vma->vm_page_prot);
+#else
+	unsigned long idx = 0, pgcount = *num;
+	int err = -EINVAL;
+
+	for (; idx < pgcount; ++idx) {
+		err = vm_insert_page_mkspecial(vma, addr + (PAGE_SIZE * idx), pages[idx]);
+		if (err)
+			break;
+	}
+	*num = pgcount - idx;
+	return err;
+#endif /* pte_index */
+}
+EXPORT_SYMBOL(vm_insert_pages_mkspecial);
+#else
+int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_page_mkspecial);
+int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr,
+		struct page **pages, unsigned long *num)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_pages_mkspecial);
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
+
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
			       unsigned long addr, unsigned long end,
			       pte_fn_t fn, void *data, bool create,
-- 
Gitee

From 8c1d651315b19f85a549f06bcab005436ca0fb60 Mon Sep 17 00:00:00 2001
From: Guixin Liu
Date: Mon, 14 Feb 2022 15:38:29 +0800
Subject: [PATCH 02/12] anolis: uio: add ioctl to uio

ANBZ: #34745

cherry picked from devel-6.6 commit 19185932d656c72d212f0807db3d5d53545ef40c.

In TCMU, if the backstore holds its own userspace buffer, then for a
read cmd the data must first be copied from the userspace buffer into
the tcmu data area, and then copied again from the tcmu data area to
the scsi sgl pages. To solve this problem, add an ioctl to uio so that
a userspace backstore can copy data between the scsi sgl pages and its
own buffer directly.
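For illustration only (this helper is not part of the patch, and the
command numbers are defined by the individual uio driver), the
userspace side reduces to a plain ioctl(2) on the uio file descriptor:

  #include <sys/ioctl.h>

  /* Forward a driver-specific command to uio_info->ioctl; the call
   * fails with errno EINVAL if the driver registered no ioctl hook. */
  static long send_backstore_cmd(int uio_fd, unsigned int cmd, void *arg)
  {
          return ioctl(uio_fd, cmd, (unsigned long)arg);
  }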
Signed-off-by: Guixin Liu
Signed-off-by: Joseph Qi
---
 drivers/uio/uio.c          | 21 +++++++++++++++++++++
 include/linux/uio_driver.h |  1 +
 2 files changed, 22 insertions(+)

diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index 1e4ade78ed84..75533f7c6bea 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -882,6 +882,25 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma)
 	return ret;
 }
 
+static long uio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct uio_listener *listener = filp->private_data;
+	struct uio_device *idev = listener->dev;
+	long retval = 0;
+
+	mutex_lock(&idev->info_lock);
+
+	if (!idev->info || !idev->info->ioctl) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = idev->info->ioctl(idev->info, cmd, arg);
+out:
+	mutex_unlock(&idev->info_lock);
+	return retval;
+}
+
 static const struct file_operations uio_fops = {
 	.owner		= THIS_MODULE,
 	.open		= uio_open,
@@ -892,6 +911,8 @@ static const struct file_operations uio_fops = {
 	.poll		= uio_poll,
 	.fasync		= uio_fasync,
 	.llseek		= noop_llseek,
+	.unlocked_ioctl	= uio_ioctl,
+	.compat_ioctl	= uio_ioctl,
 };
 
 static int uio_major_init(void)
diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h
index 02eaac47ac44..7fcf1c36fd0d 100644
--- a/include/linux/uio_driver.h
+++ b/include/linux/uio_driver.h
@@ -116,6 +116,7 @@ struct uio_info {
 	int (*open)(struct uio_info *info, struct inode *inode);
 	int (*release)(struct uio_info *info, struct inode *inode);
 	int (*irqcontrol)(struct uio_info *info, s32 irq_on);
+	long (*ioctl)(struct uio_info *info, unsigned int cmd, unsigned long arg);
 };
 
 extern int __must_check
-- 
Gitee

From ba22c39ec239b3e6f5784572fd47e6ea221561ac Mon Sep 17 00:00:00 2001
From: Guixin Liu
Date: Mon, 14 Feb 2022 15:59:11 +0800
Subject: [PATCH 03/12] anolis: scsi: target: reduce one copy by using uio
 ioctl

ANBZ: #34745

cherry picked from devel-6.6 commit 9c2579d319ab02c0323e8da25d59f725d4a882a1.

Currently there are two copies between the sg pages, the tcmu data
area and the userspace buffer. If the backstore holds its own
userspace buffer, we can use the uio ioctl to copy between the sg
pages and the userspace buffer directly, which improves performance.

Use tcm_loop and tcmu (backstore is file) to evaluate performance,
fio job:
fio -filename=/dev/sdb -ioengine=libaio -direct=1 -size=2G -name=1
-thread -runtime=60 -time_based -rw=randread -numjobs=16 -iodepth=16
-bs=128k

Without this patch:
READ: bw=2511MiB/s (2633MB/s), 154MiB/s-158MiB/s (162MB/s-166MB/s),
io=147GiB (158GB), run=60006-60008msec

With this patch:
READ: bw=2965MiB/s (3110MB/s), 183MiB/s-188MiB/s (192MB/s-197MB/s),
io=174GiB (187GB), run=60005-60007msec

There is about a 20% performance improvement in this case.
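As a usage sketch (the helper name and the single-iovec layout are
illustrative; struct tcmu_data_xfer and the ioctl numbers are the ones
added by this patch), a backstore that finished a READ into its own
buffer pushes the data into the command's sgl pages like this:

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <sys/uio.h>
  #include <linux/target_core_user.h>

  /* Push READ data from the backstore's private buffer straight into
   * the sgl pages of the command identified by cmd_id. */
  static int push_read_data(int uio_fd, uint16_t cmd_id, void *buf, size_t len)
  {
          struct iovec iov = { .iov_base = buf, .iov_len = len };
          struct tcmu_data_xfer xfer = {
                  .cmd_id  = cmd_id,
                  .iov_cnt = 1,
                  .iovec   = &iov,
          };

          /* WRITEs go the other way via TCMU_IOCTL_CMD_COPY_FROM_SGL. */
          return ioctl(uio_fd, TCMU_IOCTL_CMD_COPY_TO_SGL, &xfer);
  }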
Signed-off-by: Guixin Liu [ correctly check copy_page_[from|to]_user return value ] Signed-off-by: Joseph Qi --- drivers/target/target_core_user.c | 191 +++++++++++++++++++++++--- include/uapi/linux/target_core_user.h | 10 ++ 2 files changed, 179 insertions(+), 22 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index edc2afd5f4ee..7f7c5f68cb1f 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -123,6 +123,7 @@ struct tcmu_dev { #define TCMU_DEV_BIT_BLOCKED 2 #define TCMU_DEV_BIT_TMR_NOTIFY 3 #define TCMU_DEV_BIT_PLUGGED 4 +#define TCMU_DEV_BIT_BYPASS_DATA_AREA 5 unsigned long flags; struct uio_info uio_info; @@ -644,12 +645,17 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) tcmu_cmd->se_cmd = se_cmd; tcmu_cmd->tcmu_dev = udev; - tcmu_cmd_set_block_cnts(tcmu_cmd); - tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), - GFP_NOIO); - if (!tcmu_cmd->dbi) { - kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); - return NULL; + if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + tcmu_cmd_set_block_cnts(tcmu_cmd); + tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), + GFP_NOIO); + if (!tcmu_cmd->dbi) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + } else { + tcmu_cmd->dbi_cnt = 0; + tcmu_cmd->dbi = NULL; } return tcmu_cmd; @@ -1095,16 +1101,19 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) tcmu_cmd_reset_dbi_cur(tcmu_cmd); iov = &entry->req.iov[0]; - if (se_cmd->data_direction == DMA_TO_DEVICE || - se_cmd->se_cmd_flags & SCF_BIDI) - scatter_data_area(udev, tcmu_cmd, &iov); - else - tcmu_setup_iovs(udev, tcmu_cmd, &iov, se_cmd->data_length); + if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + if (se_cmd->data_direction == DMA_TO_DEVICE || + se_cmd->se_cmd_flags & SCF_BIDI) + scatter_data_area(udev, tcmu_cmd, &iov); + else + tcmu_setup_iovs(udev, tcmu_cmd, &iov, se_cmd->data_length); + } entry->req.iov_cnt = iov_cnt - iov_bidi_cnt; /* Handle BIDI commands */ - if (se_cmd->se_cmd_flags & SCF_BIDI) { + if ((se_cmd->se_cmd_flags & SCF_BIDI) + && !test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { iov++; tcmu_setup_iovs(udev, tcmu_cmd, &iov, tcmu_cmd->data_len_bidi); entry->req.iov_bidi_cnt = iov_bidi_cnt; @@ -1368,16 +1377,18 @@ static bool tcmu_handle_completion(struct tcmu_cmd *cmd, else se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL; } - if (se_cmd->se_cmd_flags & SCF_BIDI) { - /* Get Data-In buffer before clean up */ - gather_data_area(udev, cmd, true, read_len); - } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { - gather_data_area(udev, cmd, false, read_len); - } else if (se_cmd->data_direction == DMA_TO_DEVICE) { - /* TODO: */ - } else if (se_cmd->data_direction != DMA_NONE) { - pr_warn("TCMU: data direction was %d!\n", - se_cmd->data_direction); + if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + if (se_cmd->se_cmd_flags & SCF_BIDI) { + /* Get Data-In buffer before clean up */ + gather_data_area(udev, cmd, true, read_len); + } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + gather_data_area(udev, cmd, false, read_len); + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + /* TODO: */ + } else if (se_cmd->data_direction != DMA_NONE) { + pr_warn("TCMU: data direction was %d!\n", + se_cmd->data_direction); + } } done: @@ -2010,6 +2021,106 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) return 0; } +static long tcmu_do_copy_data(struct tcmu_cmd 
*tcmu_cmd, + struct iovec __user *uiovec, + unsigned int vcnt, + bool is_copy_to_sgl) +{ + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + struct scatterlist *data_sg, *sg; + int i; + unsigned int data_nents; + size_t copied; + + if (se_cmd->se_cmd_flags & SCF_BIDI) { + data_sg = se_cmd->t_bidi_data_sg; + data_nents = se_cmd->t_bidi_data_nents; + } else { + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + } + + ret = import_iovec(is_copy_to_sgl ? ITER_SOURCE : ITER_DEST, + uiovec, vcnt, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) { + pr_err("import iovec failed.\n"); + return -EFAULT; + } + + for_each_sg(data_sg, sg, data_nents, i) { + if (is_copy_to_sgl) + copied = copy_page_from_iter(sg_page(sg), sg->offset, sg->length, &iter); + else + copied = copy_page_to_iter(sg_page(sg), sg->offset, sg->length, &iter); + if (copied != sg->length) { + pr_err("copy failed.\n"); + ret = -EFAULT; + break; + } + } + kfree(iov); + return ret < 0 ? -EFAULT : 0; +} + +static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev, + unsigned long arg, + bool is_copy_to_sgl) +{ + struct tcmu_data_xfer __user *uxfer = (struct tcmu_data_xfer __user *)arg; + struct tcmu_data_xfer xfer; + struct tcmu_cmd *tcmu_cmd; + long ret; + + if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) + return -EINVAL; + + if (copy_from_user(&xfer, uxfer, sizeof(xfer))) + return -EFAULT; + + mutex_lock(&udev->cmdr_lock); + tcmu_cmd = xa_load(&udev->commands, xfer.cmd_id); + if (!tcmu_cmd) { + pr_err("Can not find tcmu command, cmd_id:%d\n", xfer.cmd_id); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + ret = -EFAULT; + goto out; + } + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &tcmu_cmd->flags)) { + pr_err("Command is expired, cmd_id:%d\n", xfer.cmd_id); + ret = -EFAULT; + goto out; + } + + ret = tcmu_do_copy_data(tcmu_cmd, xfer.iovec, + xfer.iov_cnt, is_copy_to_sgl); +out: + mutex_unlock(&udev->cmdr_lock); + return ret; +} + +static long tcmu_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + long ret; + + switch (cmd) { + case TCMU_IOCTL_CMD_COPY_TO_SGL: + ret = tcmu_bypass_data_area_copy_data(udev, arg, true); + break; + case TCMU_IOCTL_CMD_COPY_FROM_SGL: + ret = tcmu_bypass_data_area_copy_data(udev, arg, false); + break; + default: + ret = -EINVAL; + } + return ret; +} + static int tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd) { struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd; @@ -2266,6 +2377,7 @@ static int tcmu_configure_device(struct se_device *dev) info->mmap_prepare = tcmu_mmap_prepare; info->open = tcmu_open; info->release = tcmu_release; + info->ioctl = tcmu_ioctl; ret = uio_register_device(tcmu_root_device, info); if (ret) @@ -3147,6 +3259,40 @@ static ssize_t tcmu_free_kept_buf_store(struct config_item *item, const char *pa } CONFIGFS_ATTR_WO(tcmu_, free_kept_buf); +static ssize_t tcmu_bypass_data_area_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + if (test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) + return snprintf(page, PAGE_SIZE, "%s\n", "true"); + else + return snprintf(page, PAGE_SIZE, "%s\n", "false"); +} + +static ssize_t tcmu_bypass_data_area_store(struct config_item *item, const char *page, + size_t count) +{ + struct 
se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+	bool bypass_data_area;
+	int ret;
+
+	ret = kstrtobool(page, &bypass_data_area);
+	if (ret < 0)
+		return ret;
+
+	if (bypass_data_area)
+		set_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags);
+	else
+		clear_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags);
+
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, bypass_data_area);
+
 static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_cmd_time_out,
 	&tcmu_attr_qfull_time_out,
@@ -3158,6 +3304,7 @@ static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_emulate_write_cache,
 	&tcmu_attr_tmr_notification,
 	&tcmu_attr_nl_reply_supported,
+	&tcmu_attr_bypass_data_area,
 	NULL,
 };
 
diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index f925a77f19ed..2ce13568f196 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -185,4 +185,14 @@ enum tcmu_genl_attr {
 };
 #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1)
 
+struct tcmu_data_xfer {
+	__u16 cmd_id;
+	__u16 __pad1;
+	__u32 iov_cnt;
+	struct iovec __user *iovec;
+};
+
+#define TCMU_IOCTL_CMD_COPY_TO_SGL	_IOW('T', 0xe0, struct tcmu_data_xfer)
+#define TCMU_IOCTL_CMD_COPY_FROM_SGL	_IOR('T', 0xe1, struct tcmu_data_xfer)
+
 #endif
-- 
Gitee

From f192935f8b4205da57f7c0dee607682110f651bb Mon Sep 17 00:00:00 2001
From: Guixin Liu
Date: Tue, 31 May 2022 14:33:16 +0800
Subject: [PATCH 04/12] anolis: scsi: target: tcmu: Introduce cmd_lock to
 tcmu_cmd

ANBZ: #34745

cherry picked from devel-6.6 commit de0599010e505b8abd6802911952d4b97a2776e9.

Currently the uio framework's info_lock and tcmu's cmdr_lock force the
tcmu bypass-data-area ioctl commands to copy data sequentially, which
greatly impacts io throughput. The cmdr_lock cannot simply be dropped,
though: the tcmu timeout handler may run in parallel and mark a cmd
expired, after which the sg pages in se_cmd must not be accessed any
more, so some lock has to serialize the timeout handler against the
bypass-data-area ioctls.

To improve this a bit, introduce a per-tcmu_cmd cmd_lock. Multiple
bypass-data-area ioctl commands can then run concurrently (of course,
uio's info_lock needs to disappear too, which the next patch takes
care of) while still excluding the tcmu timeout handler.
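The resulting serialization condenses to the following sketch (the two
functions are illustrative stand-ins for the timeout handler and the
copy ioctl; the field and bit names are the ones used in the diff
below):

  /* Both paths take the per-command mutex: a copy that wins the race
   * sees TCMU_CMD_BIT_EXPIRED unset and may safely use the sgl pages;
   * one that loses sees the bit set and backs off. */
  static void timeout_path(struct tcmu_cmd *cmd)
  {
          mutex_lock(&cmd->cmd_lock);
          set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags);
          cmd->se_cmd = NULL;             /* sgl pages may go away now */
          mutex_unlock(&cmd->cmd_lock);
  }

  static long copy_path(struct tcmu_cmd *cmd)
  {
          long ret = 0;

          mutex_lock(&cmd->cmd_lock);
          if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags))
                  ret = -EFAULT;          /* buffers are already gone */
          /* else: copy via cmd->se_cmd's sgl while holding the lock */
          mutex_unlock(&cmd->cmd_lock);
          return ret;
  }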
Signed-off-by: Guixin Liu Signed-off-by: Joseph Qi --- drivers/target/target_core_user.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7f7c5f68cb1f..7c54f029d1b6 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -195,6 +195,8 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 #define TCMU_CMD_BIT_KEEP_BUF 1 unsigned long flags; + + struct mutex cmd_lock; }; struct tcmu_tmr { @@ -644,6 +646,7 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) INIT_LIST_HEAD(&tcmu_cmd->queue_entry); tcmu_cmd->se_cmd = se_cmd; tcmu_cmd->tcmu_dev = udev; + mutex_init(&tcmu_cmd->cmd_lock); if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { tcmu_cmd_set_block_cnts(tcmu_cmd); @@ -1533,11 +1536,13 @@ static void tcmu_check_expired_ring_cmd(struct tcmu_cmd *cmd) if (!time_after_eq(jiffies, cmd->deadline)) return; + mutex_lock(&cmd->cmd_lock); set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); list_del_init(&cmd->queue_entry); se_cmd = cmd->se_cmd; se_cmd->priv = NULL; cmd->se_cmd = NULL; + mutex_unlock(&cmd->cmd_lock); pr_debug("Timing out inflight cmd %u on dev %s.\n", cmd->cmd_id, cmd->tcmu_dev->name); @@ -2081,15 +2086,14 @@ static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev, if (copy_from_user(&xfer, uxfer, sizeof(xfer))) return -EFAULT; - mutex_lock(&udev->cmdr_lock); tcmu_cmd = xa_load(&udev->commands, xfer.cmd_id); if (!tcmu_cmd) { pr_err("Can not find tcmu command, cmd_id:%d\n", xfer.cmd_id); set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); - ret = -EFAULT; - goto out; + return -EFAULT; } + mutex_lock(&tcmu_cmd->cmd_lock); if (test_bit(TCMU_CMD_BIT_EXPIRED, &tcmu_cmd->flags)) { pr_err("Command is expired, cmd_id:%d\n", xfer.cmd_id); ret = -EFAULT; @@ -2099,7 +2103,7 @@ static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev, ret = tcmu_do_copy_data(tcmu_cmd, xfer.iovec, xfer.iov_cnt, is_copy_to_sgl); out: - mutex_unlock(&udev->cmdr_lock); + mutex_unlock(&tcmu_cmd->cmd_lock); return ret; } -- Gitee From 0f35c382d568beb8d1984030fefdf0a72dd42068 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 14 Feb 2022 17:15:07 +0800 Subject: [PATCH 05/12] anolis: uio: Replace mutex info_lock with percpu_ref to improve performance ANBZ: #34745 cherry picked from devel-6.6 commit f49f7deb94c7bd05ceff282c74e8a1218773bc25. The mutex info_lock was introduced to fix crash after the device is unregistered in commit 57c5f4df0a5a ("uio: fix crash after the device is unregistered"), we can replace it with more powerful percpu-ref to improve performance. Use tcm_loop and tcmu(backstore is file) to evaluate performance, fio job: fio -filename=/dev/sdb -ioengine=libaio -direct=1 -size=2G -name=1 -thread -runtime=60 -time_based -rw=randread -numjobs=16 -iodepth=16 -bs=128k Without this patch: READ: bw=2965MiB/s (3110MB/s), 183MiB/s-188MiB/s (192MB/s-197MB/s), io=174GiB (187GB), run=60005-60007msec With this patch: READ: bw=5823MiB/s (6106MB/s), 338MiB/s-379MiB/s (354MB/s-397MB/s), io=341GiB (366GB), run=60002-60005msec There is about a 100% performance improvement in this case. 
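Condensed, the read-side and teardown patterns below replace every
info_lock critical section (this is a summary of the diff that
follows, not new code):

  /* Reader side: a per-CPU reference instead of a global mutex. */
  if (!percpu_ref_tryget_live(&idev->info_ref))
          return -EINVAL;                 /* device is being unregistered */
  /* ... idev->info may be dereferenced here ... */
  percpu_ref_put(&idev->info_ref);

  /* Teardown side: kill the ref, then wait for all readers to drain. */
  percpu_ref_kill_and_confirm(&idev->info_ref, uio_confirm_info);
  wait_for_completion(&idev->confirm_done);
  wait_for_completion(&idev->free_done);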
Signed-off-by: Guixin Liu [ also fix percpu_ref leak ] Signed-off-by: Joseph Qi --- drivers/uio/uio.c | 103 +++++++++++++++++++++++++++---------- include/linux/uio_driver.h | 5 +- 2 files changed, 81 insertions(+), 27 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 75533f7c6bea..44b6dcae191d 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -12,6 +12,8 @@ * Base Functions */ +#include +#include #include #include #include @@ -219,7 +221,9 @@ static ssize_t name_show(struct device *dev, struct uio_device *idev = dev_get_drvdata(dev); int ret; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; dev_err(dev, "the device has been unregistered\n"); @@ -229,7 +233,7 @@ static ssize_t name_show(struct device *dev, ret = sprintf(buf, "%s\n", idev->info->name); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } static DEVICE_ATTR_RO(name); @@ -240,7 +244,9 @@ static ssize_t version_show(struct device *dev, struct uio_device *idev = dev_get_drvdata(dev); int ret; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; dev_err(dev, "the device has been unregistered\n"); @@ -250,7 +256,7 @@ static ssize_t version_show(struct device *dev, ret = sprintf(buf, "%s\n", idev->info->version); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } static DEVICE_ATTR_RO(version); @@ -504,16 +510,20 @@ static int uio_open(struct inode *inode, struct file *filep) listener->event_count = atomic_read(&idev->event); filep->private_data = listener; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) { + ret = -EINVAL; + goto err_infoopen; + } + if (!idev->info) { - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); ret = -EINVAL; goto err_infoopen; } if (idev->info->open) ret = idev->info->open(idev->info, inode); - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); if (ret) goto err_infoopen; @@ -546,10 +556,12 @@ static int uio_release(struct inode *inode, struct file *filep) struct uio_listener *listener = filep->private_data; struct uio_device *idev = listener->dev; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (idev->info && idev->info->release) ret = idev->info->release(idev->info, inode); - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); module_put(idev->owner); kfree(listener); @@ -563,10 +575,12 @@ static __poll_t uio_poll(struct file *filep, poll_table *wait) struct uio_device *idev = listener->dev; __poll_t ret = 0; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info || !idev->info->irq) ret = EPOLLERR; - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); if (ret) return ret; @@ -592,13 +606,17 @@ static ssize_t uio_read(struct file *filep, char __user *buf, add_wait_queue(&idev->wait, &wait); do { - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) { + retval = -EINVAL; + break; + } + if (!idev->info || !idev->info->irq) { retval = -EIO; - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); break; } - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); set_current_state(TASK_INTERRUPTIBLE); @@ -646,7 +664,9 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, if 
(copy_from_user(&irq_on, buf, count)) return -EFAULT; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { retval = -EINVAL; goto out; @@ -665,7 +685,7 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, retval = idev->info->irqcontrol(idev->info, irq_on); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return retval ? retval : sizeof(s32); } @@ -690,7 +710,9 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) vm_fault_t ret = 0; int mi; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return VM_FAULT_SIGBUS; + if (!idev->info) { ret = VM_FAULT_SIGBUS; goto out; @@ -717,8 +739,7 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) vmf->page = page; out: - mutex_unlock(&idev->info_lock); - + percpu_ref_put(&idev->info_ref); return ret; } @@ -830,7 +851,9 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) vma->vm_private_data = idev; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; goto out; @@ -878,7 +901,7 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) } out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } @@ -888,7 +911,8 @@ static long uio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct uio_device *idev = listener->dev; long retval = 0; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; if (!idev->info || !idev->info->ioctl) { retval = -EINVAL; @@ -897,7 +921,7 @@ static long uio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) retval = idev->info->ioctl(idev->info, cmd, arg); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return retval; } @@ -992,9 +1016,18 @@ static void uio_device_release(struct device *dev) { struct uio_device *idev = dev_get_drvdata(dev); + percpu_ref_exit(&idev->info_ref); kfree(idev); } +static void uio_info_free(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct uio_device, info_ref); + + complete(&idev->free_done); +} + + /** * __uio_register_device - register a new userspace IO device * @owner: module that creates the new device @@ -1025,12 +1058,21 @@ int __uio_register_device(struct module *owner, idev->owner = owner; idev->info = info; - mutex_init(&idev->info_lock); init_waitqueue_head(&idev->wait); atomic_set(&idev->event, 0); + ret = percpu_ref_init(&idev->info_ref, uio_info_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + kfree(idev); + return ret; + } + init_completion(&idev->confirm_done); + init_completion(&idev->free_done); + ret = uio_get_minor(idev); if (ret) { + percpu_ref_exit(&idev->info_ref); kfree(idev); return ret; } @@ -1124,6 +1166,13 @@ int __devm_uio_register_device(struct module *owner, } EXPORT_SYMBOL_GPL(__devm_uio_register_device); +static void uio_confirm_info(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct uio_device, info_ref); + + complete(&idev->confirm_done); +} + /** * uio_unregister_device - unregister a industrial IO device * @info: UIO device capabilities @@ -1140,14 +1189,16 @@ void uio_unregister_device(struct uio_info *info) idev = info->uio_dev; minor = idev->minor; - mutex_lock(&idev->info_lock); + percpu_ref_kill_and_confirm(&idev->info_ref, uio_confirm_info); + wait_for_completion(&idev->confirm_done); + 
wait_for_completion(&idev->free_done);
+	/* now, we can set info to NULL */
 
 	uio_dev_del_attributes(idev);
 
 	if (info->irq && info->irq != UIO_IRQ_CUSTOM)
 		free_irq(info->irq, idev);
 	idev->info = NULL;
-	mutex_unlock(&idev->info_lock);
 
 	wake_up_interruptible(&idev->wait);
 	kill_fasync(&idev->async_queue, SIGIO, POLL_HUP);
diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h
index 7fcf1c36fd0d..4ee9b76ae8e1 100644
--- a/include/linux/uio_driver.h
+++ b/include/linux/uio_driver.h
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 
 struct module;
 struct uio_map;
@@ -81,9 +82,11 @@ struct uio_device {
 	struct fasync_struct	*async_queue;
 	wait_queue_head_t	wait;
 	struct uio_info		*info;
-	struct mutex		info_lock;
 	struct kobject		*map_dir;
 	struct kobject		*portio_dir;
+	struct percpu_ref	info_ref;
+	struct completion	confirm_done;
+	struct completion	free_done;
 };
 
 /**
-- 
Gitee

From 19adcfae24d4ebe566db90a615ce865a89664edd Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang
Date: Wed, 16 Mar 2022 13:40:00 +0800
Subject: [PATCH 06/12] anolis: mm: export zap_vma_range()

ANBZ: #34745

cherry picked from devel-6.6 commit f79ad9b5139ecb06bce8d7989486c9c64e8a7f3f.

Module target_core_user will use it to implement the zero copy
feature.

Signed-off-by: Xiaoguang Wang
Signed-off-by: Guixin Liu
[ export zap_vma_range() correspondingly ]
Signed-off-by: Joseph Qi
---
 mm/memory.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/memory.c b/mm/memory.c
index edb51fdc5ec4..95bc5c10d0ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2231,6 +2231,7 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
 	zap_vma_range_batched(&tlb, vma, address, size, NULL);
 	tlb_finish_mmu(&tlb);
 }
+EXPORT_SYMBOL_GPL(zap_vma_range);
 
 /**
  * zap_special_vma_range - zap all page table entries in a special vma range
-- 
Gitee

From 451742e2abf8737661eb25dfed7a3ef2c32ad1e2 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang
Date: Tue, 15 Mar 2022 14:40:19 +0800
Subject: [PATCH 07/12] anolis: scsi: target: tcmu: Support zero copy

ANBZ: #34745

cherry picked from devel-6.6 commit a12a23b43065a58db184778afed42449267f5b09.

Currently in tcmu, for READ commands, data is copied from the
userspace backstore's buffer into tcmu's internal data area, and then
from the data area into the READ command's sgl pages. For WRITE
commands, tcmu copies the sgl pages into the internal data area, and
the data is then copied from the data area to the userspace backstore.
In both cases there is obvious copy overhead, which impacts io
throughput, especially for large io sizes.

To mitigate this issue, implement a zero copy feature in tcmu, which
maps the sgl pages into the userspace backstore's address space.
Currently a command can take the tcmu zero copy path only if the
offset and length of each of its sgl pages are aligned to the page
size.
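A usage sketch for the new ioctl (the helper is illustrative; struct
tcmu_cmd_zerocopy, the kflag and the ioctl number are the ones added
by this patch): when a ring entry carries TCMU_KFLAG_ZERO_COPY, the
backstore asks the kernel to map the command's sgl pages at the iovecs
the entry describes, services the io in place, and the kernel tears
the mapping down on completion:

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <sys/uio.h>
  #include <linux/target_core_user.h>

  /* ent_iov/ent_iov_cnt come straight from the tcmu_cmd_entry. */
  static int map_cmd_zero_copy(int uio_fd, uint16_t cmd_id,
                               struct iovec *ent_iov, uint32_t ent_iov_cnt)
  {
          struct tcmu_cmd_zerocopy zc = {
                  .iov     = ent_iov,
                  .iov_cnt = ent_iov_cnt,
                  .cmd_id  = cmd_id,
          };

          return ioctl(uio_fd, TCMU_IOCTL_CMD_ZEROCOPY, &zc);
  }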
Signed-off-by: Xiaoguang Wang Signed-off-by: Guixin Liu [ fix conflicts for vm_flags_set_flags() in tcmu_mmap_prepare(), and mark tcmu_ioctl_cmd_zerocopy() as static ] Signed-off-by: Joseph Qi --- drivers/target/target_core_user.c | 285 ++++++++++++++++++++++++-- include/uapi/linux/target_core_user.h | 8 + 2 files changed, 271 insertions(+), 22 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7c54f029d1b6..4ec8affb1f9e 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -73,6 +75,7 @@ */ #define DATA_PAGES_PER_BLK_DEF 1 #define DATA_AREA_PAGES_DEF (256 * 1024) +#define ZC_DATA_AREA_PAGES_DEF (256 * 1024) #define TCMU_MBS_TO_PAGES(_mbs) ((size_t)_mbs << (20 - PAGE_SHIFT)) #define TCMU_PAGES_TO_MBS(_pages) (_pages >> (20 - PAGE_SHIFT)) @@ -140,6 +143,7 @@ struct tcmu_dev { /* Must add data_off and mb_addr to get the address */ size_t data_off; int data_area_mb; + uint32_t zc_max_blocks; uint32_t max_blocks; size_t mmap_pages; @@ -154,6 +158,10 @@ struct tcmu_dev { uint32_t data_pages_per_blk; uint32_t data_blk_size; + uint32_t zc_dbi_max; + uint32_t zc_dbi_thresh; + unsigned long *zc_data_bitmap; + struct xarray commands; struct timer_list cmd_timer; @@ -179,6 +187,12 @@ struct tcmu_cmd { struct tcmu_dev *tcmu_dev; struct list_head queue_entry; + /* for zero_copy */ + struct mm_struct *vma_vm_mm; + struct vm_area_struct *vma; + struct iovec *iov; + int iov_cnt; + uint16_t cmd_id; /* Can't use se_cmd when cleaning up expired cmds, because if @@ -194,6 +208,7 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 #define TCMU_CMD_BIT_KEEP_BUF 1 +#define TCMU_CMD_BIT_ZEROCOPY 2 unsigned long flags; struct mutex cmd_lock; @@ -500,10 +515,38 @@ static struct genl_family tcmu_genl_family __ro_after_init = { static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len) { struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + unsigned long *data_bitmap; uint32_t i; + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + data_bitmap = udev->zc_data_bitmap; + else + data_bitmap = udev->data_bitmap; + for (i = 0; i < len; i++) - clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap); + clear_bit(tcmu_cmd->dbi[i], data_bitmap); +} + +static inline int tcmu_get_zc_empty_block(struct tcmu_dev *udev, + struct tcmu_cmd *tcmu_cmd, + int prev_dbi, int *iov_cnt) +{ + int dbi; + + dbi = find_first_zero_bit(udev->zc_data_bitmap, udev->zc_dbi_thresh); + if (dbi == udev->zc_dbi_thresh) + return -1; + + if (dbi > udev->zc_dbi_max) + udev->zc_dbi_max = dbi; + + set_bit(dbi, udev->zc_data_bitmap); + tcmu_cmd_set_dbi(tcmu_cmd, dbi); + + if (dbi != prev_dbi + 1) + *iov_cnt += 1; + + return dbi; } static inline int tcmu_get_empty_block(struct tcmu_dev *udev, @@ -555,7 +598,8 @@ static inline int tcmu_get_empty_block(struct tcmu_dev *udev, } static int tcmu_get_empty_blocks(struct tcmu_dev *udev, - struct tcmu_cmd *tcmu_cmd, int length) + struct tcmu_cmd *tcmu_cmd, int length, + bool zero_copy) { /* start value of dbi + 1 must not be a valid dbi */ int dbi = -2; @@ -564,7 +608,10 @@ static int tcmu_get_empty_blocks(struct tcmu_dev *udev, for (; length > 0; length -= blk_size) { blk_data_len = min_t(uint32_t, length, blk_size); - dbi = tcmu_get_empty_block(udev, tcmu_cmd, dbi, blk_data_len, + if (zero_copy) + dbi = tcmu_get_zc_empty_block(udev, tcmu_cmd, dbi, &iov_cnt); + else + dbi = tcmu_get_empty_block(udev, tcmu_cmd, dbi, blk_data_len, 
&iov_cnt); if (dbi < 0) return -1; @@ -572,8 +619,40 @@ static int tcmu_get_empty_blocks(struct tcmu_dev *udev, return iov_cnt; } +static void tcmu_cmd_zerocopy_unmap(struct tcmu_cmd *cmd) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct iovec *iov = cmd->iov; + unsigned long address; + int i; + + mm = cmd->vma_vm_mm; + vma = cmd->vma; + if (!mm) + return; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + for (i = 0; i < cmd->iov_cnt; i++) { + address = (unsigned long)iov->iov_base; + zap_vma_range(vma, address, iov->iov_len); + iov++; + } + mmap_read_unlock(mm); + mmput(mm); + } + + cmd->vma_vm_mm = NULL; + cmd->vma = NULL; + mmdrop(mm); + kfree(cmd->iov); +} + static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd) { + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + tcmu_cmd_zerocopy_unmap(tcmu_cmd); kfree(tcmu_cmd->dbi); kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); } @@ -861,37 +940,51 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size) * Called with ring lock held. */ static int tcmu_alloc_data_space(struct tcmu_dev *udev, struct tcmu_cmd *cmd, - int *iov_bidi_cnt) + int *iov_bidi_cnt, bool zero_copy) { int space, iov_cnt = 0, ret = 0; + unsigned long *data_bitmap; + uint32_t *dbi_thresh, max_blocks; if (!cmd->dbi_cnt) goto wr_iov_cnts; + if (zero_copy) { + data_bitmap = udev->zc_data_bitmap; + dbi_thresh = &udev->zc_dbi_thresh; + max_blocks = udev->zc_max_blocks; + } else { + data_bitmap = udev->data_bitmap; + dbi_thresh = &udev->dbi_thresh; + max_blocks = udev->max_blocks; + } + /* try to check and get the data blocks as needed */ - space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh); + space = spc_bitmap_free(data_bitmap, *dbi_thresh); if (space < cmd->dbi_cnt) { - unsigned long blocks_left = - (udev->max_blocks - udev->dbi_thresh) + space; + unsigned long blocks_left = max_blocks - *dbi_thresh + space; if (blocks_left < cmd->dbi_cnt) { - pr_debug("no data space: only %lu available, but ask for %u\n", + pr_debug("no data space[%s]: only %lu available, but ask for %u\n", ++ zero_copy ? 
"zero copy" : "non zero copy", blocks_left * udev->data_blk_size, cmd->dbi_cnt * udev->data_blk_size); return -1; } - udev->dbi_thresh += cmd->dbi_cnt; - if (udev->dbi_thresh > udev->max_blocks) - udev->dbi_thresh = udev->max_blocks; + *dbi_thresh += cmd->dbi_cnt; + if (*dbi_thresh > max_blocks) + *dbi_thresh = max_blocks; } - iov_cnt = tcmu_get_empty_blocks(udev, cmd, cmd->se_cmd->data_length); + iov_cnt = tcmu_get_empty_blocks(udev, cmd, cmd->se_cmd->data_length, + zero_copy); if (iov_cnt < 0) return -1; if (cmd->dbi_bidi_cnt) { - ret = tcmu_get_empty_blocks(udev, cmd, cmd->data_len_bidi); + ret = tcmu_get_empty_blocks(udev, cmd, cmd->data_len_bidi, + zero_copy); if (ret < 0) return -1; } @@ -1032,6 +1125,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) uint32_t blk_size = udev->data_blk_size; /* size of data buffer needed */ size_t data_length = (size_t)tcmu_cmd->dbi_cnt * blk_size; + bool zero_copy = false; *scsi_err = TCM_NO_SENSE; @@ -1055,7 +1149,22 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) return -1; } - iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt); + if (!(se_cmd->se_cmd_flags & SCF_BIDI) && se_cmd->data_length && + IS_ALIGNED(se_cmd->data_length, PAGE_SIZE)) { + struct scatterlist *data_sg = se_cmd->t_data_sg, *sg; + unsigned int data_nents = se_cmd->t_data_nents; + int i; + + for_each_sg(data_sg, sg, data_nents, i) { + if ((sg->offset && !IS_ALIGNED(sg->offset, PAGE_SIZE)) || + !IS_ALIGNED(sg->length, PAGE_SIZE)) + break; + } + if (i == data_nents) + zero_copy = true; + } + + iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt, zero_copy); if (iov_cnt < 0) goto free_and_queue; @@ -1105,7 +1214,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) iov = &entry->req.iov[0]; if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { - if (se_cmd->data_direction == DMA_TO_DEVICE || + if (((se_cmd->data_direction == DMA_TO_DEVICE) && !zero_copy) || se_cmd->se_cmd_flags & SCF_BIDI) scatter_data_area(udev, tcmu_cmd, &iov); else @@ -1125,6 +1234,19 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) tcmu_setup_cmd_timer(tcmu_cmd, udev->cmd_time_out, &udev->cmd_timer); entry->hdr.cmd_id = tcmu_cmd->cmd_id; + if (zero_copy) { + int i; + struct iovec *tiov; + + tiov = &entry->req.iov[0]; + for (i = 0; i < entry->req.iov_cnt; i++) { + tiov->iov_base = tiov->iov_base + + (TCMU_MBS_TO_PAGES(udev->data_area_mb) << PAGE_SHIFT); + tiov++; + } + entry->hdr.kflags |= TCMU_KFLAG_ZERO_COPY; + set_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + } tcmu_hdr_set_len(&entry->hdr.len_op, command_size); @@ -1381,7 +1503,9 @@ static bool tcmu_handle_completion(struct tcmu_cmd *cmd, se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL; } if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { - if (se_cmd->se_cmd_flags & SCF_BIDI) { + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) { + tcmu_cmd_zerocopy_unmap(cmd); + } else if (se_cmd->se_cmd_flags & SCF_BIDI) { /* Get Data-In buffer before clean up */ gather_data_area(udev, cmd, true, read_len); } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { @@ -1537,6 +1661,8 @@ static void tcmu_check_expired_ring_cmd(struct tcmu_cmd *cmd) return; mutex_lock(&cmd->cmd_lock); + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) + tcmu_cmd_zerocopy_unmap(cmd); set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); list_del_init(&cmd->queue_entry); se_cmd = cmd->se_cmd; @@ -1635,6 +1761,7 @@ static struct 
se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) udev->data_pages_per_blk = DATA_PAGES_PER_BLK_DEF; udev->max_blocks = DATA_AREA_PAGES_DEF / udev->data_pages_per_blk; + udev->zc_max_blocks = ZC_DATA_AREA_PAGES_DEF / udev->data_pages_per_blk; udev->cmdr_size = CMDR_SIZE_DEF; udev->data_area_mb = TCMU_PAGES_TO_MBS(DATA_AREA_PAGES_DEF); @@ -1756,6 +1883,7 @@ static void tcmu_dev_kref_release(struct kref *kref) tcmu_blocks_release(udev, 0, udev->dbi_max); bitmap_free(udev->data_bitmap); + bitmap_free(udev->zc_data_bitmap); mutex_unlock(&udev->cmdr_lock); pr_debug("dev_kref_release\n"); @@ -1956,7 +2084,7 @@ static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT, VMA_MIXEDMAP_BIT); desc->vm_ops = &tcmu_vm_ops; desc->private_data = udev; @@ -1968,6 +2096,109 @@ static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) return 0; } +#define TCMU_ZEROCOPY_PAGE_BATCH 32 + +static inline int tcmu_zerocopy_one_seg(struct iovec *iov, + struct vm_area_struct *vma, + struct sg_page_iter *sgiter) +{ + struct page *pages[TCMU_ZEROCOPY_PAGE_BATCH]; + unsigned int len = iov->iov_len; + unsigned long address = (unsigned long)iov->iov_base; + unsigned long pages_remaining, pg_index = 0; + struct page *page; + int ret; + + while (len > 0) { + __sg_page_iter_next(sgiter); + page = sg_page_iter_page(sgiter); + pages[pg_index++] = page; + len -= PAGE_SIZE; + if (pg_index == TCMU_ZEROCOPY_PAGE_BATCH || !len) { + pages_remaining = pg_index; + ret = vm_insert_pages_mkspecial(vma, address, pages, + &pages_remaining); + if (ret < 0) { + pr_err("vm insert pages failed, error code: %d\n", ret); + return ret; + } + address = address + pg_index * PAGE_SIZE; + pg_index = 0; + } + } + + return 0; +} + +static long tcmu_ioctl_cmd_zerocopy(struct tcmu_dev *udev, unsigned long arg) +{ + struct tcmu_cmd *cmd; + struct se_cmd *se_cmd; + struct scatterlist *data_sg; + unsigned int data_nents; + struct tcmu_cmd_zerocopy zc; + struct iovec *iov, *tiov; + struct sg_page_iter sgiter; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int i, ret = 0; + + if (copy_from_user(&zc, (struct tcmu_cmd_zerocopy __user *)arg, sizeof(zc))) + return -EFAULT; + + if (zc.iov_cnt <= 0) + return -EINVAL; + + iov = kmalloc_array(zc.iov_cnt, sizeof(struct iovec), GFP_KERNEL); + if (!iov) + return -ENOMEM; + if (copy_from_user(iov, zc.iov, sizeof(struct iovec) * zc.iov_cnt)) { + kfree(iov); + return -EFAULT; + } + + mutex_lock(&udev->cmdr_lock); + mmap_read_lock(mm); + cmd = xa_load(&udev->commands, zc.cmd_id); + if (!cmd) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: cmd_id %d not found\n", zc.cmd_id); + goto out; + } + se_cmd = cmd->se_cmd; + + vma = find_vma(current->mm, (unsigned long)iov->iov_base); + if (!vma) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: invalid iov_base\n"); + goto out; + } + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + __sg_page_iter_start(&sgiter, data_sg, data_nents, 0); + tiov = iov; + for (i = 0; i < zc.iov_cnt; i++) { + ret = tcmu_zerocopy_one_seg(tiov, vma, &sgiter); + if (ret < 0) { + kfree(iov); + goto out; + } + tiov++; + } + + cmd->iov = iov; + cmd->iov_cnt = zc.iov_cnt; + cmd->vma_vm_mm = vma->vm_mm; + cmd->vma = vma; + mmgrab(cmd->vma_vm_mm); +out: + mmap_read_unlock(mm); + 
mutex_unlock(&udev->cmdr_lock); + return ret; +} + static int tcmu_open(struct uio_info *info, struct inode *inode) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); @@ -2119,6 +2350,9 @@ static long tcmu_ioctl(struct uio_info *info, unsigned int cmd, unsigned long ar case TCMU_IOCTL_CMD_COPY_FROM_SGL: ret = tcmu_bypass_data_area_copy_data(udev, arg, false); break; + case TCMU_IOCTL_CMD_ZEROCOPY: + ret = tcmu_ioctl_cmd_zerocopy(udev, arg); + break; default: ret = -EINVAL; } @@ -2325,6 +2559,7 @@ static int tcmu_configure_device(struct se_device *dev) struct uio_info *info; struct tcmu_mailbox *mb; size_t data_size; + size_t zc_data_size; int ret = 0; ret = tcmu_update_uio_info(udev); @@ -2335,10 +2570,11 @@ static int tcmu_configure_device(struct se_device *dev) mutex_lock(&udev->cmdr_lock); udev->data_bitmap = bitmap_zalloc(udev->max_blocks, GFP_KERNEL); + udev->zc_data_bitmap = bitmap_zalloc(udev->zc_max_blocks, GFP_KERNEL); mutex_unlock(&udev->cmdr_lock); - if (!udev->data_bitmap) { + if (!udev->data_bitmap || !udev->zc_data_bitmap) { ret = -ENOMEM; - goto err_bitmap_alloc; + goto err_vzalloc; } mb = vzalloc(udev->cmdr_size + CMDR_OFF); @@ -2352,9 +2588,12 @@ static int tcmu_configure_device(struct se_device *dev) udev->cmdr = (void *)mb + CMDR_OFF; udev->data_off = udev->cmdr_size + CMDR_OFF; data_size = TCMU_MBS_TO_PAGES(udev->data_area_mb) << PAGE_SHIFT; - udev->mmap_pages = (data_size + udev->cmdr_size + CMDR_OFF) >> PAGE_SHIFT; + zc_data_size = (udev->zc_max_blocks * udev->data_pages_per_blk) << PAGE_SHIFT; + udev->mmap_pages = (data_size + zc_data_size + udev->cmdr_size + + CMDR_OFF) >> PAGE_SHIFT; udev->data_blk_size = udev->data_pages_per_blk * PAGE_SIZE; udev->dbi_thresh = 0; /* Default in Idle state */ + udev->zc_dbi_thresh = 0; /* Default in Idle state */ /* Initialise the mailbox of the ring buffer */ mb->version = TCMU_MAILBOX_VERSION; @@ -2372,7 +2611,8 @@ static int tcmu_configure_device(struct se_device *dev) info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr; - info->mem[0].size = data_size + udev->cmdr_size + CMDR_OFF; + info->mem[0].size = data_size + zc_data_size + + udev->cmdr_size + CMDR_OFF; info->mem[0].memtype = UIO_MEM_NONE; info->irqcontrol = tcmu_irqcontrol; @@ -2428,7 +2668,8 @@ static int tcmu_configure_device(struct se_device *dev) err_vzalloc: bitmap_free(udev->data_bitmap); udev->data_bitmap = NULL; -err_bitmap_alloc: + kfree(udev->zc_data_bitmap); + udev->zc_data_bitmap = NULL; kfree(info->name); info->name = NULL; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 2ce13568f196..eba0cac0c8d2 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -73,6 +73,7 @@ enum tcmu_opcode { struct tcmu_cmd_entry_hdr { __u32 len_op; __u16 cmd_id; +#define TCMU_KFLAG_ZERO_COPY 0x1 __u8 kflags; #define TCMU_UFLAG_UNKNOWN_OP 0x1 #define TCMU_UFLAG_READ_LEN 0x2 @@ -194,5 +195,12 @@ struct tcmu_data_xfer { #define TCMU_IOCTL_CMD_COPY_TO_SGL _IOW('T', 0xe0, struct tcmu_data_xfer) #define TCMU_IOCTL_CMD_COPY_FROM_SGL _IOR('T', 0xe1, struct tcmu_data_xfer) +#define TCMU_IOCTL_CMD_ZEROCOPY _IOW('T', 0xe2, struct tcmu_cmd_zerocopy) + +struct tcmu_cmd_zerocopy { + struct iovec __user *iov; + __u32 iov_cnt; + __u16 cmd_id; +}; #endif -- Gitee From b05e3500cbabe4c5cdd6d40b3b7c608429aefa38 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 30 Mar 2022 13:42:06 +0800 Subject: [PATCH 08/12] anolis: scsi: 
target: tcmu: make zero copy and bypass data area configurable

ANBZ: #34745

cherry picked from devel-6.6 commit ba96c5bf9a824daffedcc26576d1b8a2614a9e22.

Add the configfs files read_zc_size, write_zc_size,
read_bypass_data_area and write_bypass_data_area to control, per
direction, which commands bypass the data area or take the zero copy
path.

Signed-off-by: Guixin Liu
Signed-off-by: Joseph Qi
---
 drivers/target/target_core_user.c     | 248 +++++++++++++++++++++-----
 include/uapi/linux/target_core_user.h |   1 +
 2 files changed, 202 insertions(+), 47 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 4ec8affb1f9e..2a9b2de8df93 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -126,7 +126,8 @@ struct tcmu_dev {
 #define TCMU_DEV_BIT_BLOCKED 2
 #define TCMU_DEV_BIT_TMR_NOTIFY 3
 #define TCMU_DEV_BIT_PLUGGED 4
-#define TCMU_DEV_BIT_BYPASS_DATA_AREA 5
+#define TCMU_DEV_BIT_READ_BYPASS_DATA_AREA 5
+#define TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA 6
 	unsigned long flags;
 
 	struct uio_info uio_info;
@@ -161,6 +162,8 @@ struct tcmu_dev {
 	uint32_t zc_dbi_max;
 	uint32_t zc_dbi_thresh;
 	unsigned long *zc_data_bitmap;
+	uint32_t read_zc_size;
+	uint32_t write_zc_size;
 
 	struct xarray commands;
 
@@ -209,6 +212,7 @@ struct tcmu_cmd {
 #define TCMU_CMD_BIT_EXPIRED 0
 #define TCMU_CMD_BIT_KEEP_BUF 1
 #define TCMU_CMD_BIT_ZEROCOPY 2
+#define TCMU_CMD_BIT_BYPASS_DATA_AREA 3
 	unsigned long flags;
 
 	struct mutex cmd_lock;
@@ -712,11 +716,67 @@ static void tcmu_setup_iovs(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		dbi = new_block_to_iov(udev, cmd, iov, dbi, data_length);
 }
 
+static void tcmu_set_cmd_bypass_data_area(struct tcmu_cmd *tcmu_cmd)
+{
+	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+
+	/*
+	 * Zero copy maps the sg pages into userspace, while bypass data
+	 * area copies data between the sg pages and the userspace buffer,
+	 * so they are completely different.
+	 */
+	if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags))
+		return;
+
+	if (se_cmd->data_direction == DMA_FROM_DEVICE &&
+	    test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags))
+		set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags);
+
+	if (se_cmd->data_direction == DMA_TO_DEVICE &&
+	    test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags))
+		set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags);
+}
+
+static void tcmu_set_cmd_do_zero_copy(struct tcmu_cmd *tcmu_cmd)
+{
+	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+	struct scatterlist *data_sg = se_cmd->t_data_sg, *sg;
+	unsigned int data_nents = se_cmd->t_data_nents;
+	int i;
+
+	if ((se_cmd->se_cmd_flags & SCF_BIDI) || !se_cmd->data_length ||
+	    !IS_ALIGNED(se_cmd->data_length, PAGE_SIZE))
+		return;
+
+	if ((se_cmd->data_direction == DMA_FROM_DEVICE) &&
+	    (!udev->read_zc_size ||
+	     se_cmd->data_length < (udev->read_zc_size << 10)))
+		return;
+
+	if ((se_cmd->data_direction == DMA_TO_DEVICE) &&
+	    (!udev->write_zc_size ||
+	     se_cmd->data_length < (udev->write_zc_size << 10)))
+		return;
+
+	/* Now, check that every sg page is aligned.
*/ + for_each_sg(data_sg, sg, data_nents, i) { + if ((sg->offset && !IS_ALIGNED(sg->offset, PAGE_SIZE)) || + !IS_ALIGNED(sg->length, PAGE_SIZE)) + break; + } + if (i == data_nents) + set_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); +} + static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) { struct se_device *se_dev = se_cmd->se_dev; struct tcmu_dev *udev = TCMU_DEV(se_dev); struct tcmu_cmd *tcmu_cmd; + bool zero_copy; + bool bypass_data_area; tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_NOIO); if (!tcmu_cmd) @@ -727,7 +787,12 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) tcmu_cmd->tcmu_dev = udev; mutex_init(&tcmu_cmd->cmd_lock); - if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + tcmu_set_cmd_do_zero_copy(tcmu_cmd); + tcmu_set_cmd_bypass_data_area(tcmu_cmd); + + zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); + if (zero_copy || !bypass_data_area) { tcmu_cmd_set_block_cnts(tcmu_cmd); tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), GFP_NOIO); @@ -946,7 +1011,7 @@ static int tcmu_alloc_data_space(struct tcmu_dev *udev, struct tcmu_cmd *cmd, unsigned long *data_bitmap; uint32_t *dbi_thresh, max_blocks; - if (!cmd->dbi_cnt) + if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags)) goto wr_iov_cnts; if (zero_copy) { @@ -1125,7 +1190,8 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) uint32_t blk_size = udev->data_blk_size; /* size of data buffer needed */ size_t data_length = (size_t)tcmu_cmd->dbi_cnt * blk_size; - bool zero_copy = false; + bool zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + bool bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); *scsi_err = TCM_NO_SENSE; @@ -1149,21 +1215,6 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) return -1; } - if (!(se_cmd->se_cmd_flags & SCF_BIDI) && se_cmd->data_length && - IS_ALIGNED(se_cmd->data_length, PAGE_SIZE)) { - struct scatterlist *data_sg = se_cmd->t_data_sg, *sg; - unsigned int data_nents = se_cmd->t_data_nents; - int i; - - for_each_sg(data_sg, sg, data_nents, i) { - if ((sg->offset && !IS_ALIGNED(sg->offset, PAGE_SIZE)) || - !IS_ALIGNED(sg->length, PAGE_SIZE)) - break; - } - if (i == data_nents) - zero_copy = true; - } - iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt, zero_copy); if (iov_cnt < 0) goto free_and_queue; @@ -1213,7 +1264,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) tcmu_cmd_reset_dbi_cur(tcmu_cmd); iov = &entry->req.iov[0]; - if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + if (zero_copy || !bypass_data_area) { if (((se_cmd->data_direction == DMA_TO_DEVICE) && !zero_copy) || se_cmd->se_cmd_flags & SCF_BIDI) scatter_data_area(udev, tcmu_cmd, &iov); @@ -1224,8 +1275,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) entry->req.iov_cnt = iov_cnt - iov_bidi_cnt; /* Handle BIDI commands */ - if ((se_cmd->se_cmd_flags & SCF_BIDI) - && !test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) { + if ((se_cmd->se_cmd_flags & SCF_BIDI) && !bypass_data_area) { iov++; tcmu_setup_iovs(udev, tcmu_cmd, &iov, tcmu_cmd->data_len_bidi); entry->req.iov_bidi_cnt = iov_bidi_cnt; @@ -1245,9 +1295,11 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) tiov++; } entry->hdr.kflags |= TCMU_KFLAG_ZERO_COPY; - set_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); 
 	}
+	if (bypass_data_area)
+		entry->hdr.kflags |= TCMU_KFLAG_BYPASS_DATA_AREA;
+
 	tcmu_hdr_set_len(&entry->hdr.len_op, command_size);
 
 	/* All offsets relative to mb_addr, not start of entry! */
@@ -1502,20 +1554,25 @@ static bool tcmu_handle_completion(struct tcmu_cmd *cmd,
 		else
 			se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL;
 	}
-	if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags)) {
-		if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) {
-			tcmu_cmd_zerocopy_unmap(cmd);
-		} else if (se_cmd->se_cmd_flags & SCF_BIDI) {
-			/* Get Data-In buffer before clean up */
-			gather_data_area(udev, cmd, true, read_len);
-		} else if (se_cmd->data_direction == DMA_FROM_DEVICE) {
-			gather_data_area(udev, cmd, false, read_len);
-		} else if (se_cmd->data_direction == DMA_TO_DEVICE) {
-			/* TODO: */
-		} else if (se_cmd->data_direction != DMA_NONE) {
-			pr_warn("TCMU: data direction was %d!\n",
-				se_cmd->data_direction);
-		}
+
+	if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) {
+		tcmu_cmd_zerocopy_unmap(cmd);
+		goto done;
+	}
+
+	if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags))
+		goto done;
+
+	if (se_cmd->se_cmd_flags & SCF_BIDI) {
+		/* Get Data-In buffer before clean up */
+		gather_data_area(udev, cmd, true, read_len);
+	} else if (se_cmd->data_direction == DMA_FROM_DEVICE) {
+		gather_data_area(udev, cmd, false, read_len);
+	} else if (se_cmd->data_direction == DMA_TO_DEVICE) {
+		/* TODO: */
+	} else if (se_cmd->data_direction != DMA_NONE) {
+		pr_warn("TCMU: data direction was %d!\n",
+			se_cmd->data_direction);
 	}
 
 done:
@@ -1766,6 +1823,8 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	udev->data_area_mb = TCMU_PAGES_TO_MBS(DATA_AREA_PAGES_DEF);
 
 	mutex_init(&udev->cmdr_lock);
+	udev->read_zc_size = 0;
+	udev->write_zc_size = 0;
 
 	INIT_LIST_HEAD(&udev->node);
 	INIT_LIST_HEAD(&udev->timedout_entry);
@@ -2311,9 +2370,6 @@ static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev,
 	struct tcmu_cmd *tcmu_cmd;
 	long ret;
 
-	if (!test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags))
-		return -EINVAL;
-
 	if (copy_from_user(&xfer, uxfer, sizeof(xfer)))
 		return -EFAULT;
 
@@ -2325,6 +2381,11 @@ static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev,
 	}
 
 	mutex_lock(&tcmu_cmd->cmd_lock);
+	if (!test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (test_bit(TCMU_CMD_BIT_EXPIRED, &tcmu_cmd->flags)) {
 		pr_err("Command is expired, cmd_id:%d\n", xfer.cmd_id);
 		ret = -EFAULT;
@@ -3504,19 +3565,19 @@ static ssize_t tcmu_free_kept_buf_store(struct config_item *item, const char *pa
 }
 CONFIGFS_ATTR_WO(tcmu_, free_kept_buf);
 
-static ssize_t tcmu_bypass_data_area_show(struct config_item *item, char *page)
+static ssize_t tcmu_read_bypass_data_area_show(struct config_item *item, char *page)
 {
 	struct se_dev_attrib *da = container_of(to_config_group(item),
 					struct se_dev_attrib, da_group);
 	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
 
-	if (test_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags))
+	if (test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags))
 		return snprintf(page, PAGE_SIZE, "%s\n", "true");
 	else
 		return snprintf(page, PAGE_SIZE, "%s\n", "false");
 }
 
-static ssize_t tcmu_bypass_data_area_store(struct config_item *item, const char *page,
+static ssize_t tcmu_read_bypass_data_area_store(struct config_item *item, const char *page,
 				size_t count)
 {
 	struct se_dev_attrib *da = container_of(to_config_group(item),
@@ -3530,13 +3591,103 @@ static ssize_t tcmu_bypass_data_area_store(struct config_item *item, const char
 		return ret;
 
 	if (bypass_data_area)
-		set_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags);
+		set_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags);
 	else
-		clear_bit(TCMU_DEV_BIT_BYPASS_DATA_AREA, &udev->flags);
+		clear_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags);
+
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, read_bypass_data_area);
+
+static ssize_t tcmu_write_bypass_data_area_show(struct config_item *item, char *page)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+
+	if (test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags))
+		return snprintf(page, PAGE_SIZE, "%s\n", "true");
+	else
+		return snprintf(page, PAGE_SIZE, "%s\n", "false");
+}
+
+static ssize_t tcmu_write_bypass_data_area_store(struct config_item *item, const char *page,
+				size_t count)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+	bool bypass_data_area;
+	int ret;
+
+	ret = kstrtobool(page, &bypass_data_area);
+	if (ret < 0)
+		return ret;
+
+	if (bypass_data_area)
+		set_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags);
+	else
+		clear_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags);
+
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, write_bypass_data_area);
+
+static ssize_t tcmu_read_zc_size_show(struct config_item *item, char *page)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+
+	return snprintf(page, PAGE_SIZE, "%ukb\n", udev->read_zc_size);
+}
+
+static ssize_t tcmu_read_zc_size_store(struct config_item *item, const char *page,
+				size_t count)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+	uint32_t read_zc_size;
+	int ret;
+
+	ret = kstrtou32(page, 0, &read_zc_size);
+	if (ret < 0)
+		return ret;
+
+	udev->read_zc_size = read_zc_size;
+
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, read_zc_size);
+
+static ssize_t tcmu_write_zc_size_show(struct config_item *item, char *page)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+
+	return snprintf(page, PAGE_SIZE, "%ukb\n", udev->write_zc_size);
+}
+
+static ssize_t tcmu_write_zc_size_store(struct config_item *item, const char *page,
+				size_t count)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+	uint32_t write_zc_size;
+	int ret;
+
+	ret = kstrtou32(page, 0, &write_zc_size);
+	if (ret < 0)
+		return ret;
+
+	udev->write_zc_size = write_zc_size;
 
 	return count;
 }
-CONFIGFS_ATTR(tcmu_, bypass_data_area);
+CONFIGFS_ATTR(tcmu_, write_zc_size);
 
 static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_cmd_time_out,
@@ -3549,7 +3700,10 @@ static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_emulate_write_cache,
 	&tcmu_attr_tmr_notification,
 	&tcmu_attr_nl_reply_supported,
-	&tcmu_attr_bypass_data_area,
+	&tcmu_attr_read_bypass_data_area,
+	&tcmu_attr_write_bypass_data_area,
+	&tcmu_attr_read_zc_size,
+	&tcmu_attr_write_zc_size,
 	NULL,
 };
 
diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index eba0cac0c8d2..8931c2bb0afe 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -74,6 +74,7 @@ struct tcmu_cmd_entry_hdr {
 	__u32 len_op;
 	__u16 cmd_id;
 #define TCMU_KFLAG_ZERO_COPY	0x1
+#define TCMU_KFLAG_BYPASS_DATA_AREA	0x2
 	__u8 kflags;
 #define TCMU_UFLAG_UNKNOWN_OP	0x1
 #define TCMU_UFLAG_READ_LEN	0x2
-- 
Gitee

From 023b47eff9ecf3f0fe750ae416ec98e4b0e98f37 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang
Date: Tue, 15 Mar 2022 15:57:57 +0800
Subject: [PATCH 09/12] anolis: scsi: target: tcmu: use new rw_semaphore to
 protect truncate

ANBZ: #34745

cherry picked from devel-6.6 commit 09dbb16291e12cacfd7c1d99f780b1c370a25ab5.

Currently tcmu_vma_fault() takes udev->cmdr_lock to serialize against a
concurrent find_free_blocks(), which unmaps idle pages and truncates them.
This closely resembles a filesystem truncate, which is normally protected
against racing faults by the inode's i_mmap_sem. Replace cmdr_lock in the
tcmu fault path with a new rw_semaphore, which also allows page faults to
proceed concurrently.

Signed-off-by: Guixin Liu
Signed-off-by: Joseph Qi
---
 drivers/target/target_core_user.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 2a9b2de8df93..79fda71781c6 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -149,6 +149,7 @@ struct tcmu_dev {
 	size_t mmap_pages;
 
 	struct mutex cmdr_lock;
+	struct rw_semaphore i_mmap_sem;
 	struct list_head qfull_queue;
 	struct list_head tmr_queue;
 
@@ -1825,6 +1826,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	mutex_init(&udev->cmdr_lock);
 	udev->read_zc_size = 0;
 	udev->write_zc_size = 0;
+	init_rwsem(&udev->i_mmap_sem);
 
 	INIT_LIST_HEAD(&udev->node);
 	INIT_LIST_HEAD(&udev->timedout_entry);
@@ -2043,12 +2045,12 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi)
 {
 	struct page *page;
 
-	mutex_lock(&udev->cmdr_lock);
+	down_read(&udev->i_mmap_sem);
 	page = xa_load(&udev->data_pages, dpi);
 	if (likely(page)) {
 		get_page(page);
 		lock_page(page);
-		mutex_unlock(&udev->cmdr_lock);
+		up_read(&udev->i_mmap_sem);
 		return page;
 	}
 
@@ -2058,7 +2060,7 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi)
 	 */
 	pr_err("Invalid addr to data page mapping (dpi %u) on device %s\n",
 	       dpi, udev->name);
-	mutex_unlock(&udev->cmdr_lock);
+	up_read(&udev->i_mmap_sem);
 	return NULL;
 }
 
@@ -3768,6 +3770,7 @@ static void find_free_blocks(void)
 			continue;
 		}
 
+		down_write(&udev->i_mmap_sem);
 		end = udev->dbi_max + 1;
 		block = find_last_bit(udev->data_bitmap, end);
 		if (block == udev->dbi_max) {
@@ -3775,6 +3778,7 @@ static void find_free_blocks(void)
 			 * The last bit is dbi_max, so it is not possible
 			 * reclaim any blocks.
 			 */
+			up_write(&udev->i_mmap_sem);
 			mutex_unlock(&udev->cmdr_lock);
 			continue;
 		} else if (block == end) {
@@ -3802,6 +3806,7 @@ static void find_free_blocks(void)
 		off = udev->data_off + (loff_t)start * udev->data_blk_size;
 		unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
+		up_write(&udev->i_mmap_sem);
 		mutex_unlock(&udev->cmdr_lock);
 
 		total_pages_freed += pages_freed;
-- 
Gitee

From aa401df0f12ffba662935cdc5223e6d3794bfc4b Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Wed, 21 Feb 2024 17:24:49 +0800
Subject: [PATCH 10/12] anolis: tcm_loop: allow sg_tablesize to be settable

ANBZ: #34745

cherry picked from devel-6.6 commit 2087e1c2fb3488b607d113d674a5daddf97e06d3.

Currently tcm_loop's default sg_tablesize is 256, so in the worst case,
where each scatterlist segment carries only a single 4k page, it can
support at most a 1M IO (the arithmetic is worked through in the sketch
below).
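As a quick check of that arithmetic, here is a minimal userspace C sketch
(illustration only, not driver code; a 4k PAGE_SIZE is assumed):

#include <stdio.h>

/* Worst case: each scatterlist entry carries one page, so the largest
 * unsplit IO per command is sg_tablesize * PAGE_SIZE. */
#define PAGE_SZ 4096UL

int main(void)
{
	unsigned long sg_tablesize = 256;	/* the current hard-coded value */

	printf("max worst-case IO: %lu KiB\n",
	       sg_tablesize * PAGE_SZ / 1024);	/* 1024 KiB, i.e. 1M */
	printf("entries needed for an unsplit 2M IO: %lu\n",
	       (2UL * 1024 * 1024) / PAGE_SZ);	/* 512 */
	return 0;
}

With the module parameter added below, the limit can then be raised at
load time, e.g. modprobe tcm_loop sg_tablesize=512.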
This doesn't meet the requirements of some user scenarios. Following
"scsi: tcm_loop: Allow queues, can_queue and cmd_per_lun to be settable",
make 'sg_tablesize' settable by the user as well.

Signed-off-by: Joseph Qi
Signed-off-by: Guixin Liu
---
 drivers/target/loopback/tcm_loop.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 528883d989b8..88d6843d1783 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -55,6 +55,9 @@ module_param_named(can_queue, tcm_loop_can_queue, uint, 0644);
 static unsigned int tcm_loop_cmd_per_lun = 1024;
 module_param_named(cmd_per_lun, tcm_loop_cmd_per_lun, uint, 0644);
 
+static unsigned short tcm_loop_sg_tablesize = 256;
+module_param_named(sg_tablesize, tcm_loop_sg_tablesize, ushort, 0644);
+
 /*
  * Called from struct target_core_fabric_ops->check_stop_free()
  */
@@ -343,7 +346,6 @@ static const struct scsi_host_template tcm_loop_driver_template = {
 	.eh_device_reset_handler = tcm_loop_device_reset,
 	.eh_target_reset_handler = tcm_loop_target_reset,
 	.this_id = -1,
-	.sg_tablesize = 256,
 	.max_sectors = 0xFFFF,
 	.dma_boundary = PAGE_SIZE - 1,
 	.module = THIS_MODULE,
@@ -381,6 +383,7 @@ static int tcm_loop_driver_probe(struct device *dev)
 	sh->nr_hw_queues = tcm_loop_nr_hw_queues;
 	sh->can_queue = tcm_loop_can_queue;
 	sh->cmd_per_lun = tcm_loop_cmd_per_lun;
+	sh->sg_tablesize = tcm_loop_sg_tablesize;
 
 	host_prot = SHOST_DIF_TYPE1_PROTECTION | SHOST_DIF_TYPE2_PROTECTION |
 		SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE1_PROTECTION |
-- 
Gitee

From dfb6eff2cc5a4597737d9ca7849b972a38984339 Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Wed, 17 Jan 2024 11:14:39 +0800
Subject: [PATCH 11/12] anolis: Revert "block: always define BIO_MAX_PAGES as
 256"

ANBZ: #34745

cherry picked from devel-6.6 commit 9d50589ab3de74c1cecacdcf04c4cb98ec1b29ed.

This reverts commit 6861428921b51113520cd47897be6c2774e4fc58.

In some user scenarios we want to issue a big IO (e.g. 2M) and expect it
not to be split. But in the worst case each bio_vec carries only a single
page, so a bio can actually hold at most 1M. Raise BIO_MAX_VECS (formerly
BIO_MAX_PAGES) back to 512, i.e. HPAGE_PMD_NR on 4k-page systems when THP
swap is enabled, to fulfill the above user scenarios. This also keeps the
limit consistent with kernel 4.19.

Signed-off-by: Joseph Qi
Signed-off-by: Guixin Liu
---
 include/linux/bio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 92da13d3fdea..a80db0464744 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -11,7 +11,15 @@
 #include <linux/blk_types.h>
 #include <linux/uio.h>
 
+#ifdef CONFIG_THP_SWAP
+#if HPAGE_PMD_NR > 256
+#define BIO_MAX_VECS	(HPAGE_PMD_NR * 1U)
+#else
 #define BIO_MAX_VECS	256U
+#endif
+#else
+#define BIO_MAX_VECS	256U
+#endif
 #define BIO_MAX_INLINE_VECS	UIO_MAXIOV
 
 struct queue_limits;
-- 
Gitee

From 6e7d89fb1a14ac17cbf4ddf34f63c51abfe10a37 Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Fri, 8 May 2026 10:12:42 +0800
Subject: [PATCH 12/12] anolis: scsi: target: tcmu: fix mapped page leak on
 zerocopy error

ANBZ: #34745

In tcmu_ioctl_cmd_zerocopy(), if tcmu_zerocopy_one_seg() fails in the
middle of the loop, pages successfully mapped by prior iterations are
left mapped in userspace with no cleanup. The iov array is freed, but
the special PTE mappings remain, causing a resource leak.

Additionally, tcmu_zerocopy_one_seg() maps pages in batches of
TCMU_ZEROCOPY_PAGE_BATCH. If a batch fails mid-segment, prior batches
within that same segment have already inserted PTEs, so the rollback
must also zap the current, partially mapped segment (see the sketch
below).
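A minimal sketch of the intended error path (map_one_seg() and nr_segs are
placeholder names for illustration; only zap_vma_range() and the iov
layout are taken from this series):

#include <linux/mm.h>
#include <linux/uio.h>

/* Sketch: map nr_segs user segments, undoing all work on failure.
 * The rollback deliberately runs up to and including the failing
 * index i: that segment may already be partially mapped, and
 * zapping not-present PTEs is harmless. */
static int map_all_segs(struct vm_area_struct *vma, struct iovec *iov,
			int nr_segs)
{
	int i, j, ret;

	for (i = 0; i < nr_segs; i++) {
		ret = map_one_seg(vma, &iov[i]);	/* placeholder helper */
		if (ret < 0) {
			for (j = 0; j <= i; j++)	/* note: <=, not < */
				zap_vma_range(vma,
					(unsigned long)iov[j].iov_base,
					iov[j].iov_len);
			return ret;
		}
	}
	return 0;
}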
Fix by zapping the already-mapped pages for all prior iov entries plus
the current failing entry before freeing iov and returning the error.
zap_vma_range() is safe on not-present PTEs, so zapping the full segment
length handles the partial case correctly.

Fixes: 451742e2abf87 ("anolis: scsi: target: tcmu: Support zero copy")
Signed-off-by: Joseph Qi
---
 drivers/target/target_core_user.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 79fda71781c6..9fd6425f41b2 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -2243,6 +2243,14 @@ static long tcmu_ioctl_cmd_zerocopy(struct tcmu_dev *udev, unsigned long arg)
 	for (i = 0; i < zc.iov_cnt; i++) {
 		ret = tcmu_zerocopy_one_seg(tiov, vma, &sgiter);
 		if (ret < 0) {
+			/* Roll back already mapped pages */
+			int j;
+			unsigned long address;
+
+			for (j = 0; j <= i; j++) {
+				address = (unsigned long)iov[j].iov_base;
+				zap_vma_range(vma, address, iov[j].iov_len);
+			}
 			kfree(iov);
 			goto out;
 		}
-- 
Gitee
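For completeness, a hedged userspace sketch driving the four configfs
attributes added in patch 08. The attribute names come from that patch;
the configfs path layout and the interpretation of the zc_size values (a
size threshold in KiB, as the "%ukb" show format suggests) are
assumptions, so adjust both for the actual deployment:

#include <stdio.h>

/* Assumed TCMU configfs layout; HBA index and device name will differ. */
#define ATTRIB_DIR "/sys/kernel/config/target/core/user_0/mydev/attrib"

static int write_attr(const char *name, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", ATTRIB_DIR, name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* The stores parse with kstrtobool, so 0/1/true/false all work. */
	write_attr("read_bypass_data_area", "1");
	write_attr("write_bypass_data_area", "0");
	/* The stores parse a plain u32; the shows report the value in kb. */
	write_attr("read_zc_size", "256");
	write_attr("write_zc_size", "256");
	return 0;
}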