diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index 528883d989b8f0d4754658575f48a1d0b82973b4..88d6843d1783bccfee33cd1b9e65012965bdabad 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -55,6 +55,9 @@ module_param_named(can_queue, tcm_loop_can_queue, uint, 0644); static unsigned int tcm_loop_cmd_per_lun = 1024; module_param_named(cmd_per_lun, tcm_loop_cmd_per_lun, uint, 0644); +static unsigned short tcm_loop_sg_tablesize = 256; +module_param_named(sg_tablesize, tcm_loop_sg_tablesize, ushort, 0644); + /* * Called from struct target_core_fabric_ops->check_stop_free() */ @@ -343,7 +346,6 @@ static const struct scsi_host_template tcm_loop_driver_template = { .eh_device_reset_handler = tcm_loop_device_reset, .eh_target_reset_handler = tcm_loop_target_reset, .this_id = -1, - .sg_tablesize = 256, .max_sectors = 0xFFFF, .dma_boundary = PAGE_SIZE - 1, .module = THIS_MODULE, @@ -381,6 +383,7 @@ static int tcm_loop_driver_probe(struct device *dev) sh->nr_hw_queues = tcm_loop_nr_hw_queues; sh->can_queue = tcm_loop_can_queue; sh->cmd_per_lun = tcm_loop_cmd_per_lun; + sh->sg_tablesize = tcm_loop_sg_tablesize; host_prot = SHOST_DIF_TYPE1_PROTECTION | SHOST_DIF_TYPE2_PROTECTION | SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE1_PROTECTION | diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index edc2afd5f4ee2c10b4bb84ee7290751c15a5e3b4..9fd6425f41b2a03d2301e780b355c6770dbef7d9 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -73,6 +75,7 @@ */ #define DATA_PAGES_PER_BLK_DEF 1 #define DATA_AREA_PAGES_DEF (256 * 1024) +#define ZC_DATA_AREA_PAGES_DEF (256 * 1024) #define TCMU_MBS_TO_PAGES(_mbs) ((size_t)_mbs << (20 - PAGE_SHIFT)) #define TCMU_PAGES_TO_MBS(_pages) (_pages >> (20 - PAGE_SHIFT)) @@ -123,6 +126,8 @@ struct tcmu_dev { #define TCMU_DEV_BIT_BLOCKED 2 #define TCMU_DEV_BIT_TMR_NOTIFY 3 #define TCMU_DEV_BIT_PLUGGED 4 +#define TCMU_DEV_BIT_READ_BYPASS_DATA_AREA 5 +#define TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA 6 unsigned long flags; struct uio_info uio_info; @@ -139,10 +144,12 @@ struct tcmu_dev { /* Must add data_off and mb_addr to get the address */ size_t data_off; int data_area_mb; + uint32_t zc_max_blocks; uint32_t max_blocks; size_t mmap_pages; struct mutex cmdr_lock; + struct rw_semaphore i_mmap_sem; struct list_head qfull_queue; struct list_head tmr_queue; @@ -153,6 +160,12 @@ struct tcmu_dev { uint32_t data_pages_per_blk; uint32_t data_blk_size; + uint32_t zc_dbi_max; + uint32_t zc_dbi_thresh; + unsigned long *zc_data_bitmap; + uint32_t read_zc_size; + uint32_t write_zc_size; + struct xarray commands; struct timer_list cmd_timer; @@ -178,6 +191,12 @@ struct tcmu_cmd { struct tcmu_dev *tcmu_dev; struct list_head queue_entry; + /* for zero_copy */ + struct mm_struct *vma_vm_mm; + struct vm_area_struct *vma; + struct iovec *iov; + int iov_cnt; + uint16_t cmd_id; /* Can't use se_cmd when cleaning up expired cmds, because if @@ -193,7 +212,11 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 #define TCMU_CMD_BIT_KEEP_BUF 1 +#define TCMU_CMD_BIT_ZEROCOPY 2 +#define TCMU_CMD_BIT_BYPASS_DATA_AREA 3 unsigned long flags; + + struct mutex cmd_lock; }; struct tcmu_tmr { @@ -497,10 +520,38 @@ static struct genl_family tcmu_genl_family __ro_after_init = { static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len) { struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; 
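+	/*
+	 * A zero-copy command took its blocks from zc_data_bitmap, so its
+	 * dbis must be released there; all other commands release into the
+	 * regular data_bitmap.
+	 */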
+	unsigned long *data_bitmap;
 	uint32_t i;
 
+	if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags))
+		data_bitmap = udev->zc_data_bitmap;
+	else
+		data_bitmap = udev->data_bitmap;
+
 	for (i = 0; i < len; i++)
-		clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
+		clear_bit(tcmu_cmd->dbi[i], data_bitmap);
+}
+
+static inline int tcmu_get_zc_empty_block(struct tcmu_dev *udev,
+					  struct tcmu_cmd *tcmu_cmd,
+					  int prev_dbi, int *iov_cnt)
+{
+	int dbi;
+
+	dbi = find_first_zero_bit(udev->zc_data_bitmap, udev->zc_dbi_thresh);
+	if (dbi == udev->zc_dbi_thresh)
+		return -1;
+
+	if (dbi > udev->zc_dbi_max)
+		udev->zc_dbi_max = dbi;
+
+	set_bit(dbi, udev->zc_data_bitmap);
+	tcmu_cmd_set_dbi(tcmu_cmd, dbi);
+
+	if (dbi != prev_dbi + 1)
+		*iov_cnt += 1;
+
+	return dbi;
 }
 
 static inline int tcmu_get_empty_block(struct tcmu_dev *udev,
@@ -552,7 +603,8 @@ static inline int tcmu_get_empty_block(struct tcmu_dev *udev,
 }
 
 static int tcmu_get_empty_blocks(struct tcmu_dev *udev,
-				 struct tcmu_cmd *tcmu_cmd, int length)
+				 struct tcmu_cmd *tcmu_cmd, int length,
+				 bool zero_copy)
 {
 	/* start value of dbi + 1 must not be a valid dbi */
 	int dbi = -2;
@@ -561,7 +613,10 @@ static int tcmu_get_empty_blocks(struct tcmu_dev *udev,
 
 	for (; length > 0; length -= blk_size) {
 		blk_data_len = min_t(uint32_t, length, blk_size);
-		dbi = tcmu_get_empty_block(udev, tcmu_cmd, dbi, blk_data_len,
+		if (zero_copy)
+			dbi = tcmu_get_zc_empty_block(udev, tcmu_cmd, dbi, &iov_cnt);
+		else
+			dbi = tcmu_get_empty_block(udev, tcmu_cmd, dbi, blk_data_len,
 					   &iov_cnt);
 		if (dbi < 0)
 			return -1;
@@ -569,8 +624,40 @@ static int tcmu_get_empty_blocks(struct tcmu_dev *udev,
 	return iov_cnt;
 }
 
+static void tcmu_cmd_zerocopy_unmap(struct tcmu_cmd *cmd)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct iovec *iov = cmd->iov;
+	unsigned long address;
+	int i;
+
+	mm = cmd->vma_vm_mm;
+	vma = cmd->vma;
+	if (!mm)
+		return;
+
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+		for (i = 0; i < cmd->iov_cnt; i++) {
+			address = (unsigned long)iov->iov_base;
+			zap_vma_range(vma, address, iov->iov_len);
+			iov++;
+		}
+		mmap_read_unlock(mm);
+		mmput(mm);
+	}
+
+	cmd->vma_vm_mm = NULL;
+	cmd->vma = NULL;
+	mmdrop(mm);
+	kfree(cmd->iov);
+}
+
 static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
 {
+	if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags))
+		tcmu_cmd_zerocopy_unmap(tcmu_cmd);
 	kfree(tcmu_cmd->dbi);
 	kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
 }
@@ -630,11 +717,67 @@ static void tcmu_setup_iovs(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		dbi = new_block_to_iov(udev, cmd, iov, dbi, data_length);
 }
 
+static void tcmu_set_cmd_bypass_data_area(struct tcmu_cmd *tcmu_cmd)
+{
+	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+
+	/*
+	 * Zero copy maps the command's sg pages into userspace, while
+	 * bypass data area copies data between the sg pages and a
+	 * userspace buffer, so the two modes are completely different.
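+	 *
+	 * A minimal sketch of the userspace side of zero copy (assuming an
+	 * open uio fd and a ring entry whose kflags carry
+	 * TCMU_KFLAG_ZERO_COPY; error handling omitted):
+	 *
+	 *	struct tcmu_cmd_zerocopy zc = {
+	 *		.iov = entry->req.iov,
+	 *		.iov_cnt = entry->req.iov_cnt,
+	 *		.cmd_id = entry->hdr.cmd_id,
+	 *	};
+	 *	ioctl(uio_fd, TCMU_IOCTL_CMD_ZEROCOPY, &zc);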
+ */ + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + return; + + if (se_cmd->data_direction == DMA_FROM_DEVICE && + test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags)) + set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); + + if (se_cmd->data_direction == DMA_TO_DEVICE && + test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags)) + set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); +} + +static void tcmu_set_cmd_do_zero_copy(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + struct scatterlist *data_sg = se_cmd->t_data_sg, *sg; + unsigned int data_nents = se_cmd->t_data_nents; + int i; + + if ((se_cmd->se_cmd_flags & SCF_BIDI) || !se_cmd->data_length || + !IS_ALIGNED(se_cmd->data_length, PAGE_SIZE)) + return; + + if ((se_cmd->data_direction == DMA_FROM_DEVICE) && + (!udev->read_zc_size || + se_cmd->data_length < (udev->read_zc_size << 10))) + return; + + if ((se_cmd->data_direction == DMA_TO_DEVICE) && + (!udev->write_zc_size || + se_cmd->data_length < (udev->write_zc_size << 10))) + return; + + /* Now, check every sg pages is aligned. */ + for_each_sg(data_sg, sg, data_nents, i) { + if ((sg->offset && !IS_ALIGNED(sg->offset, PAGE_SIZE)) || + !IS_ALIGNED(sg->length, PAGE_SIZE)) + break; + } + if (i == data_nents) + set_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); +} + static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) { struct se_device *se_dev = se_cmd->se_dev; struct tcmu_dev *udev = TCMU_DEV(se_dev); struct tcmu_cmd *tcmu_cmd; + bool zero_copy; + bool bypass_data_area; tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_NOIO); if (!tcmu_cmd) @@ -643,13 +786,24 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) INIT_LIST_HEAD(&tcmu_cmd->queue_entry); tcmu_cmd->se_cmd = se_cmd; tcmu_cmd->tcmu_dev = udev; - - tcmu_cmd_set_block_cnts(tcmu_cmd); - tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), - GFP_NOIO); - if (!tcmu_cmd->dbi) { - kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); - return NULL; + mutex_init(&tcmu_cmd->cmd_lock); + + tcmu_set_cmd_do_zero_copy(tcmu_cmd); + tcmu_set_cmd_bypass_data_area(tcmu_cmd); + + zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); + if (zero_copy || !bypass_data_area) { + tcmu_cmd_set_block_cnts(tcmu_cmd); + tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), + GFP_NOIO); + if (!tcmu_cmd->dbi) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + } else { + tcmu_cmd->dbi_cnt = 0; + tcmu_cmd->dbi = NULL; } return tcmu_cmd; @@ -852,37 +1006,51 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size) * Called with ring lock held. 
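+ *
+ * For zero-copy commands the blocks reserved here are only slots in
+ * the zero-copy window; no data pages are attached to them until
+ * userspace maps the command's sg pages via TCMU_IOCTL_CMD_ZEROCOPY.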
 */
 static int tcmu_alloc_data_space(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
-				 int *iov_bidi_cnt)
+				 int *iov_bidi_cnt, bool zero_copy)
 {
 	int space, iov_cnt = 0, ret = 0;
+	unsigned long *data_bitmap;
+	uint32_t *dbi_thresh, max_blocks;
 
-	if (!cmd->dbi_cnt)
+	if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags))
 		goto wr_iov_cnts;
 
+	if (zero_copy) {
+		data_bitmap = udev->zc_data_bitmap;
+		dbi_thresh = &udev->zc_dbi_thresh;
+		max_blocks = udev->zc_max_blocks;
+	} else {
+		data_bitmap = udev->data_bitmap;
+		dbi_thresh = &udev->dbi_thresh;
+		max_blocks = udev->max_blocks;
+	}
+
 	/* try to check and get the data blocks as needed */
-	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
+	space = spc_bitmap_free(data_bitmap, *dbi_thresh);
 	if (space < cmd->dbi_cnt) {
-		unsigned long blocks_left =
-			(udev->max_blocks - udev->dbi_thresh) + space;
+		unsigned long blocks_left = max_blocks - *dbi_thresh + space;
 
 		if (blocks_left < cmd->dbi_cnt) {
-			pr_debug("no data space: only %lu available, but ask for %u\n",
+			pr_debug("no data space[%s]: only %lu available, but ask for %u\n",
+				 zero_copy ? "zero copy" : "non zero copy",
 				 blocks_left * udev->data_blk_size,
 				 cmd->dbi_cnt * udev->data_blk_size);
 			return -1;
 		}
 
-		udev->dbi_thresh += cmd->dbi_cnt;
-		if (udev->dbi_thresh > udev->max_blocks)
-			udev->dbi_thresh = udev->max_blocks;
+		*dbi_thresh += cmd->dbi_cnt;
+		if (*dbi_thresh > max_blocks)
+			*dbi_thresh = max_blocks;
 	}
 
-	iov_cnt = tcmu_get_empty_blocks(udev, cmd, cmd->se_cmd->data_length);
+	iov_cnt = tcmu_get_empty_blocks(udev, cmd, cmd->se_cmd->data_length,
+					zero_copy);
 	if (iov_cnt < 0)
 		return -1;
 
 	if (cmd->dbi_bidi_cnt) {
-		ret = tcmu_get_empty_blocks(udev, cmd, cmd->data_len_bidi);
+		ret = tcmu_get_empty_blocks(udev, cmd, cmd->data_len_bidi,
+					    zero_copy);
 		if (ret < 0)
 			return -1;
 	}
@@ -1023,6 +1191,8 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	uint32_t blk_size = udev->data_blk_size;
 	/* size of data buffer needed */
 	size_t data_length = (size_t)tcmu_cmd->dbi_cnt * blk_size;
+	bool zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags);
+	bool bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags);
 
 	*scsi_err = TCM_NO_SENSE;
 
@@ -1046,7 +1216,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 		return -1;
 	}
 
-	iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt);
+	iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt, zero_copy);
 	if (iov_cnt < 0)
 		goto free_and_queue;
 
@@ -1095,16 +1265,18 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
 	iov = &entry->req.iov[0];
 
-	if (se_cmd->data_direction == DMA_TO_DEVICE ||
-	    se_cmd->se_cmd_flags & SCF_BIDI)
-		scatter_data_area(udev, tcmu_cmd, &iov);
-	else
-		tcmu_setup_iovs(udev, tcmu_cmd, &iov, se_cmd->data_length);
+	if (zero_copy || !bypass_data_area) {
+		if (((se_cmd->data_direction == DMA_TO_DEVICE) && !zero_copy) ||
+		    se_cmd->se_cmd_flags & SCF_BIDI)
+			scatter_data_area(udev, tcmu_cmd, &iov);
+		else
+			tcmu_setup_iovs(udev, tcmu_cmd, &iov, se_cmd->data_length);
+	}
 
 	entry->req.iov_cnt = iov_cnt - iov_bidi_cnt;
 
 	/* Handle BIDI commands */
-	if (se_cmd->se_cmd_flags & SCF_BIDI) {
+	if ((se_cmd->se_cmd_flags & SCF_BIDI) && !bypass_data_area) {
 		iov++;
 		tcmu_setup_iovs(udev, tcmu_cmd, &iov, tcmu_cmd->data_len_bidi);
 		entry->req.iov_bidi_cnt = iov_bidi_cnt;
@@ -1113,6 +1285,21 @@
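+	/*
+	 * The zero-copy iovecs below are offset past the regular data area
+	 * into the zero-copy window; e.g. with the default 1 GiB data area
+	 * (DATA_AREA_PAGES_DEF, assuming 4 KiB pages), each iov_base is
+	 * bumped by 1 GiB from the start of the mapping.
+	 */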
tcmu_setup_cmd_timer(tcmu_cmd, udev->cmd_time_out, &udev->cmd_timer); entry->hdr.cmd_id = tcmu_cmd->cmd_id; + if (zero_copy) { + int i; + struct iovec *tiov; + + tiov = &entry->req.iov[0]; + for (i = 0; i < entry->req.iov_cnt; i++) { + tiov->iov_base = tiov->iov_base + + (TCMU_MBS_TO_PAGES(udev->data_area_mb) << PAGE_SHIFT); + tiov++; + } + entry->hdr.kflags |= TCMU_KFLAG_ZERO_COPY; + } + + if (bypass_data_area) + entry->hdr.kflags |= TCMU_KFLAG_BYPASS_DATA_AREA; tcmu_hdr_set_len(&entry->hdr.len_op, command_size); @@ -1368,6 +1555,15 @@ static bool tcmu_handle_completion(struct tcmu_cmd *cmd, else se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL; } + + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) { + tcmu_cmd_zerocopy_unmap(cmd); + goto done; + } + + if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags)) + goto done; + if (se_cmd->se_cmd_flags & SCF_BIDI) { /* Get Data-In buffer before clean up */ gather_data_area(udev, cmd, true, read_len); @@ -1522,11 +1718,15 @@ static void tcmu_check_expired_ring_cmd(struct tcmu_cmd *cmd) if (!time_after_eq(jiffies, cmd->deadline)) return; + mutex_lock(&cmd->cmd_lock); + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) + tcmu_cmd_zerocopy_unmap(cmd); set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); list_del_init(&cmd->queue_entry); se_cmd = cmd->se_cmd; se_cmd->priv = NULL; cmd->se_cmd = NULL; + mutex_unlock(&cmd->cmd_lock); pr_debug("Timing out inflight cmd %u on dev %s.\n", cmd->cmd_id, cmd->tcmu_dev->name); @@ -1619,10 +1819,14 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) udev->data_pages_per_blk = DATA_PAGES_PER_BLK_DEF; udev->max_blocks = DATA_AREA_PAGES_DEF / udev->data_pages_per_blk; + udev->zc_max_blocks = ZC_DATA_AREA_PAGES_DEF / udev->data_pages_per_blk; udev->cmdr_size = CMDR_SIZE_DEF; udev->data_area_mb = TCMU_PAGES_TO_MBS(DATA_AREA_PAGES_DEF); mutex_init(&udev->cmdr_lock); + udev->read_zc_size = 0; + udev->write_zc_size = 0; + init_rwsem(&udev->i_mmap_sem); INIT_LIST_HEAD(&udev->node); INIT_LIST_HEAD(&udev->timedout_entry); @@ -1740,6 +1944,7 @@ static void tcmu_dev_kref_release(struct kref *kref) tcmu_blocks_release(udev, 0, udev->dbi_max); bitmap_free(udev->data_bitmap); + bitmap_free(udev->zc_data_bitmap); mutex_unlock(&udev->cmdr_lock); pr_debug("dev_kref_release\n"); @@ -1840,12 +2045,12 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi) { struct page *page; - mutex_lock(&udev->cmdr_lock); + down_read(&udev->i_mmap_sem); page = xa_load(&udev->data_pages, dpi); if (likely(page)) { get_page(page); lock_page(page); - mutex_unlock(&udev->cmdr_lock); + up_read(&udev->i_mmap_sem); return page; } @@ -1855,7 +2060,7 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi) */ pr_err("Invalid addr to data page mapping (dpi %u) on device %s\n", dpi, udev->name); - mutex_unlock(&udev->cmdr_lock); + up_read(&udev->i_mmap_sem); return NULL; } @@ -1940,7 +2145,7 @@ static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT, VMA_MIXEDMAP_BIT); desc->vm_ops = &tcmu_vm_ops; desc->private_data = udev; @@ -1952,6 +2157,117 @@ static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) return 0; } +#define TCMU_ZEROCOPY_PAGE_BATCH 32 + +static inline int tcmu_zerocopy_one_seg(struct iovec *iov, + struct vm_area_struct 
*vma, + struct sg_page_iter *sgiter) +{ + struct page *pages[TCMU_ZEROCOPY_PAGE_BATCH]; + unsigned int len = iov->iov_len; + unsigned long address = (unsigned long)iov->iov_base; + unsigned long pages_remaining, pg_index = 0; + struct page *page; + int ret; + + while (len > 0) { + __sg_page_iter_next(sgiter); + page = sg_page_iter_page(sgiter); + pages[pg_index++] = page; + len -= PAGE_SIZE; + if (pg_index == TCMU_ZEROCOPY_PAGE_BATCH || !len) { + pages_remaining = pg_index; + ret = vm_insert_pages_mkspecial(vma, address, pages, + &pages_remaining); + if (ret < 0) { + pr_err("vm insert pages failed, error code: %d\n", ret); + return ret; + } + address = address + pg_index * PAGE_SIZE; + pg_index = 0; + } + } + + return 0; +} + +static long tcmu_ioctl_cmd_zerocopy(struct tcmu_dev *udev, unsigned long arg) +{ + struct tcmu_cmd *cmd; + struct se_cmd *se_cmd; + struct scatterlist *data_sg; + unsigned int data_nents; + struct tcmu_cmd_zerocopy zc; + struct iovec *iov, *tiov; + struct sg_page_iter sgiter; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int i, ret = 0; + + if (copy_from_user(&zc, (struct tcmu_cmd_zerocopy __user *)arg, sizeof(zc))) + return -EFAULT; + + if (zc.iov_cnt <= 0) + return -EINVAL; + + iov = kmalloc_array(zc.iov_cnt, sizeof(struct iovec), GFP_KERNEL); + if (!iov) + return -ENOMEM; + if (copy_from_user(iov, zc.iov, sizeof(struct iovec) * zc.iov_cnt)) { + kfree(iov); + return -EFAULT; + } + + mutex_lock(&udev->cmdr_lock); + mmap_read_lock(mm); + cmd = xa_load(&udev->commands, zc.cmd_id); + if (!cmd) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: cmd_id %d not found\n", zc.cmd_id); + goto out; + } + se_cmd = cmd->se_cmd; + + vma = find_vma(current->mm, (unsigned long)iov->iov_base); + if (!vma) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: invalid iov_base\n"); + goto out; + } + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + __sg_page_iter_start(&sgiter, data_sg, data_nents, 0); + tiov = iov; + for (i = 0; i < zc.iov_cnt; i++) { + ret = tcmu_zerocopy_one_seg(tiov, vma, &sgiter); + if (ret < 0) { + /* Rollback already mapped pages */ + int j; + unsigned long address; + + for (j = 0; j <= i; j++) { + address = (unsigned long)iov[j].iov_base; + zap_vma_range(vma, address, iov[j].iov_len); + } + kfree(iov); + goto out; + } + tiov++; + } + + cmd->iov = iov; + cmd->iov_cnt = zc.iov_cnt; + cmd->vma_vm_mm = vma->vm_mm; + cmd->vma = vma; + mmgrab(cmd->vma_vm_mm); +out: + mmap_read_unlock(mm); + mutex_unlock(&udev->cmdr_lock); + return ret; +} + static int tcmu_open(struct uio_info *info, struct inode *inode) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); @@ -2010,6 +2326,110 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) return 0; } +static long tcmu_do_copy_data(struct tcmu_cmd *tcmu_cmd, + struct iovec __user *uiovec, + unsigned int vcnt, + bool is_copy_to_sgl) +{ + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + struct scatterlist *data_sg, *sg; + int i; + unsigned int data_nents; + size_t copied; + + if (se_cmd->se_cmd_flags & SCF_BIDI) { + data_sg = se_cmd->t_bidi_data_sg; + data_nents = se_cmd->t_bidi_data_nents; + } else { + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + } + + ret = import_iovec(is_copy_to_sgl ? 
ITER_SOURCE : ITER_DEST, + uiovec, vcnt, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) { + pr_err("import iovec failed.\n"); + return -EFAULT; + } + + for_each_sg(data_sg, sg, data_nents, i) { + if (is_copy_to_sgl) + copied = copy_page_from_iter(sg_page(sg), sg->offset, sg->length, &iter); + else + copied = copy_page_to_iter(sg_page(sg), sg->offset, sg->length, &iter); + if (copied != sg->length) { + pr_err("copy failed.\n"); + ret = -EFAULT; + break; + } + } + kfree(iov); + return ret < 0 ? -EFAULT : 0; +} + +static long tcmu_bypass_data_area_copy_data(struct tcmu_dev *udev, + unsigned long arg, + bool is_copy_to_sgl) +{ + struct tcmu_data_xfer __user *uxfer = (struct tcmu_data_xfer __user *)arg; + struct tcmu_data_xfer xfer; + struct tcmu_cmd *tcmu_cmd; + long ret; + + if (copy_from_user(&xfer, uxfer, sizeof(xfer))) + return -EFAULT; + + tcmu_cmd = xa_load(&udev->commands, xfer.cmd_id); + if (!tcmu_cmd) { + pr_err("Can not find tcmu command, cmd_id:%d\n", xfer.cmd_id); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + return -EFAULT; + } + + mutex_lock(&tcmu_cmd->cmd_lock); + if (!test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags)) { + ret = -EINVAL; + goto out; + } + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &tcmu_cmd->flags)) { + pr_err("Command is expired, cmd_id:%d\n", xfer.cmd_id); + ret = -EFAULT; + goto out; + } + + ret = tcmu_do_copy_data(tcmu_cmd, xfer.iovec, + xfer.iov_cnt, is_copy_to_sgl); +out: + mutex_unlock(&tcmu_cmd->cmd_lock); + return ret; +} + +static long tcmu_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + long ret; + + switch (cmd) { + case TCMU_IOCTL_CMD_COPY_TO_SGL: + ret = tcmu_bypass_data_area_copy_data(udev, arg, true); + break; + case TCMU_IOCTL_CMD_COPY_FROM_SGL: + ret = tcmu_bypass_data_area_copy_data(udev, arg, false); + break; + case TCMU_IOCTL_CMD_ZEROCOPY: + ret = tcmu_ioctl_cmd_zerocopy(udev, arg); + break; + default: + ret = -EINVAL; + } + return ret; +} + static int tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd) { struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd; @@ -2210,6 +2630,7 @@ static int tcmu_configure_device(struct se_device *dev) struct uio_info *info; struct tcmu_mailbox *mb; size_t data_size; + size_t zc_data_size; int ret = 0; ret = tcmu_update_uio_info(udev); @@ -2220,10 +2641,11 @@ static int tcmu_configure_device(struct se_device *dev) mutex_lock(&udev->cmdr_lock); udev->data_bitmap = bitmap_zalloc(udev->max_blocks, GFP_KERNEL); + udev->zc_data_bitmap = bitmap_zalloc(udev->zc_max_blocks, GFP_KERNEL); mutex_unlock(&udev->cmdr_lock); - if (!udev->data_bitmap) { + if (!udev->data_bitmap || !udev->zc_data_bitmap) { ret = -ENOMEM; - goto err_bitmap_alloc; + goto err_vzalloc; } mb = vzalloc(udev->cmdr_size + CMDR_OFF); @@ -2237,9 +2659,12 @@ static int tcmu_configure_device(struct se_device *dev) udev->cmdr = (void *)mb + CMDR_OFF; udev->data_off = udev->cmdr_size + CMDR_OFF; data_size = TCMU_MBS_TO_PAGES(udev->data_area_mb) << PAGE_SHIFT; - udev->mmap_pages = (data_size + udev->cmdr_size + CMDR_OFF) >> PAGE_SHIFT; + zc_data_size = (udev->zc_max_blocks * udev->data_pages_per_blk) << PAGE_SHIFT; + udev->mmap_pages = (data_size + zc_data_size + udev->cmdr_size + + CMDR_OFF) >> PAGE_SHIFT; udev->data_blk_size = udev->data_pages_per_blk * PAGE_SIZE; udev->dbi_thresh = 0; /* Default in Idle state */ + udev->zc_dbi_thresh = 0; /* Default in Idle state */ /* Initialise the mailbox of the ring buffer */ mb->version = TCMU_MAILBOX_VERSION; @@ 
-2257,7 +2682,8 @@ static int tcmu_configure_device(struct se_device *dev) info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr; - info->mem[0].size = data_size + udev->cmdr_size + CMDR_OFF; + info->mem[0].size = data_size + zc_data_size + + udev->cmdr_size + CMDR_OFF; info->mem[0].memtype = UIO_MEM_NONE; info->irqcontrol = tcmu_irqcontrol; @@ -2266,6 +2692,7 @@ static int tcmu_configure_device(struct se_device *dev) info->mmap_prepare = tcmu_mmap_prepare; info->open = tcmu_open; info->release = tcmu_release; + info->ioctl = tcmu_ioctl; ret = uio_register_device(tcmu_root_device, info); if (ret) @@ -2312,7 +2739,8 @@ static int tcmu_configure_device(struct se_device *dev) err_vzalloc: bitmap_free(udev->data_bitmap); udev->data_bitmap = NULL; -err_bitmap_alloc: + kfree(udev->zc_data_bitmap); + udev->zc_data_bitmap = NULL; kfree(info->name); info->name = NULL; @@ -3147,6 +3575,130 @@ static ssize_t tcmu_free_kept_buf_store(struct config_item *item, const char *pa } CONFIGFS_ATTR_WO(tcmu_, free_kept_buf); +static ssize_t tcmu_read_bypass_data_area_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + if (test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags)) + return snprintf(page, PAGE_SIZE, "%s\n", "true"); + else + return snprintf(page, PAGE_SIZE, "%s\n", "false"); +} + +static ssize_t tcmu_read_bypass_data_area_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + bool bypass_data_area; + int ret; + + ret = kstrtobool(page, &bypass_data_area); + if (ret < 0) + return ret; + + if (bypass_data_area) + set_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags); + else + clear_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags); + + return count; +} +CONFIGFS_ATTR(tcmu_, read_bypass_data_area); + +static ssize_t tcmu_write_bypass_data_area_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + if (test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags)) + return snprintf(page, PAGE_SIZE, "%s\n", "true"); + else + return snprintf(page, PAGE_SIZE, "%s\n", "false"); +} + +static ssize_t tcmu_write_bypass_data_area_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + bool bypass_data_area; + int ret; + + ret = kstrtobool(page, &bypass_data_area); + if (ret < 0) + return ret; + + if (bypass_data_area) + set_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags); + else + clear_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags); + + return count; +} +CONFIGFS_ATTR(tcmu_, write_bypass_data_area); + +static ssize_t tcmu_read_zc_size_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + return snprintf(page, PAGE_SIZE, "%ukb\n", udev->read_zc_size); +} + +static ssize_t tcmu_read_zc_size_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da 
= container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + uint32_t read_zc_size; + int ret; + + ret = kstrtou32(page, 0, &read_zc_size); + if (ret < 0) + return ret; + + udev->read_zc_size = read_zc_size; + + return count; +} +CONFIGFS_ATTR(tcmu_, read_zc_size); + +static ssize_t tcmu_write_zc_size_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + return snprintf(page, PAGE_SIZE, "%ukb\n", udev->write_zc_size); +} + +static ssize_t tcmu_write_zc_size_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + uint32_t write_zc_size; + int ret; + + ret = kstrtou32(page, 0, &write_zc_size); + if (ret < 0) + return ret; + + udev->write_zc_size = write_zc_size; + + return count; +} +CONFIGFS_ATTR(tcmu_, write_zc_size); + static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_cmd_time_out, &tcmu_attr_qfull_time_out, @@ -3158,6 +3710,10 @@ static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_emulate_write_cache, &tcmu_attr_tmr_notification, &tcmu_attr_nl_reply_supported, + &tcmu_attr_read_bypass_data_area, + &tcmu_attr_write_bypass_data_area, + &tcmu_attr_read_zc_size, + &tcmu_attr_write_zc_size, NULL, }; @@ -3222,6 +3778,7 @@ static void find_free_blocks(void) continue; } + down_write(&udev->i_mmap_sem); end = udev->dbi_max + 1; block = find_last_bit(udev->data_bitmap, end); if (block == udev->dbi_max) { @@ -3229,6 +3786,7 @@ static void find_free_blocks(void) * The last bit is dbi_max, so it is not possible * reclaim any blocks. 
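+			 *
+			 * i_mmap_sem, taken above, fences tcmu_try_get_data_page()
+			 * readers while data pages are unmapped and freed, so it
+			 * must also be dropped on this early exit.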
			 */
+			up_write(&udev->i_mmap_sem);
 			mutex_unlock(&udev->cmdr_lock);
 			continue;
 		} else if (block == end) {
@@ -3256,6 +3814,7 @@
 		off = udev->data_off + (loff_t)start * udev->data_blk_size;
 		unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
+		up_write(&udev->i_mmap_sem);
 		mutex_unlock(&udev->cmdr_lock);
 
 		total_pages_freed += pages_freed;
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index 1e4ade78ed8491220787b715b36001de023fcdf9..44b6dcae191d1b7651904d9bbe0371e0f5e04f27 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -12,6 +12,8 @@
  * Base Functions
  */
 
+#include
+#include
 #include
 #include
 #include
@@ -219,7 +221,9 @@ static ssize_t name_show(struct device *dev,
 	struct uio_device *idev = dev_get_drvdata(dev);
 	int ret;
 
-	mutex_lock(&idev->info_lock);
+	if (!percpu_ref_tryget_live(&idev->info_ref))
+		return -EINVAL;
+
 	if (!idev->info) {
 		ret = -EINVAL;
 		dev_err(dev, "the device has been unregistered\n");
@@ -229,7 +233,7 @@ static ssize_t name_show(struct device *dev,
 	ret = sprintf(buf, "%s\n", idev->info->name);
 
 out:
-	mutex_unlock(&idev->info_lock);
+	percpu_ref_put(&idev->info_ref);
 	return ret;
 }
 static DEVICE_ATTR_RO(name);
@@ -240,7 +244,9 @@ static ssize_t version_show(struct device *dev,
 	struct uio_device *idev = dev_get_drvdata(dev);
 	int ret;
 
-	mutex_lock(&idev->info_lock);
+	if (!percpu_ref_tryget_live(&idev->info_ref))
+		return -EINVAL;
+
 	if (!idev->info) {
 		ret = -EINVAL;
 		dev_err(dev, "the device has been unregistered\n");
@@ -250,7 +256,7 @@ static ssize_t version_show(struct device *dev,
 	ret = sprintf(buf, "%s\n", idev->info->version);
 
 out:
-	mutex_unlock(&idev->info_lock);
+	percpu_ref_put(&idev->info_ref);
 	return ret;
 }
 static DEVICE_ATTR_RO(version);
@@ -504,16 +510,20 @@ static int uio_open(struct inode *inode, struct file *filep)
 	listener->event_count = atomic_read(&idev->event);
 	filep->private_data = listener;
 
-	mutex_lock(&idev->info_lock);
+	if (!percpu_ref_tryget_live(&idev->info_ref)) {
+		ret = -EINVAL;
+		goto err_infoopen;
+	}
+
 	if (!idev->info) {
-		mutex_unlock(&idev->info_lock);
+		percpu_ref_put(&idev->info_ref);
 		ret = -EINVAL;
 		goto err_infoopen;
 	}
 
 	if (idev->info->open)
 		ret = idev->info->open(idev->info, inode);
-	mutex_unlock(&idev->info_lock);
+	percpu_ref_put(&idev->info_ref);
 	if (ret)
 		goto err_infoopen;
@@ -546,10 +556,12 @@ static int uio_release(struct inode *inode, struct file *filep)
 	struct uio_listener *listener = filep->private_data;
 	struct uio_device *idev = listener->dev;
 
-	mutex_lock(&idev->info_lock);
+	if (!percpu_ref_tryget_live(&idev->info_ref))
+		goto out;
+
 	if (idev->info && idev->info->release)
 		ret = idev->info->release(idev->info, inode);
-	mutex_unlock(&idev->info_lock);
+	percpu_ref_put(&idev->info_ref);
 
+out:
 	module_put(idev->owner);
 	kfree(listener);
@@ -563,10 +575,12 @@ static __poll_t uio_poll(struct file *filep, poll_table *wait)
 	struct uio_device *idev = listener->dev;
 	__poll_t ret = 0;
 
-	mutex_lock(&idev->info_lock);
+	if (!percpu_ref_tryget_live(&idev->info_ref))
+		return EPOLLERR;
+
 	if (!idev->info || !idev->info->irq)
 		ret = EPOLLERR;
-	mutex_unlock(&idev->info_lock);
+	percpu_ref_put(&idev->info_ref);
 	if (ret)
 		return ret;
@@ -592,13 +606,17 @@ static ssize_t uio_read(struct file *filep, char __user *buf,
 	add_wait_queue(&idev->wait, &wait);
 
 	do {
-		mutex_lock(&idev->info_lock);
+		if (!percpu_ref_tryget_live(&idev->info_ref)) {
+			retval = -EINVAL;
+			break;
+		}
+
 		if (!idev->info || !idev->info->irq) {
 			retval = -EIO;
-			mutex_unlock(&idev->info_lock);
+			percpu_ref_put(&idev->info_ref);
 			break;
 		}
-
mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); set_current_state(TASK_INTERRUPTIBLE); @@ -646,7 +664,9 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, if (copy_from_user(&irq_on, buf, count)) return -EFAULT; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { retval = -EINVAL; goto out; @@ -665,7 +685,7 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, retval = idev->info->irqcontrol(idev->info, irq_on); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return retval ? retval : sizeof(s32); } @@ -690,7 +710,9 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) vm_fault_t ret = 0; int mi; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return VM_FAULT_SIGBUS; + if (!idev->info) { ret = VM_FAULT_SIGBUS; goto out; @@ -717,8 +739,7 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) vmf->page = page; out: - mutex_unlock(&idev->info_lock); - + percpu_ref_put(&idev->info_ref); return ret; } @@ -830,7 +851,9 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) vma->vm_private_data = idev; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; goto out; @@ -878,10 +901,30 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) } out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } +static long uio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct uio_listener *listener = filp->private_data; + struct uio_device *idev = listener->dev; + long retval = 0; + + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + + if (!idev->info || !idev->info->ioctl) { + retval = -EINVAL; + goto out; + } + + retval = idev->info->ioctl(idev->info, cmd, arg); +out: + percpu_ref_put(&idev->info_ref); + return retval; +} + static const struct file_operations uio_fops = { .owner = THIS_MODULE, .open = uio_open, @@ -892,6 +935,8 @@ static const struct file_operations uio_fops = { .poll = uio_poll, .fasync = uio_fasync, .llseek = noop_llseek, + .unlocked_ioctl = uio_ioctl, + .compat_ioctl = uio_ioctl, }; static int uio_major_init(void) @@ -971,9 +1016,18 @@ static void uio_device_release(struct device *dev) { struct uio_device *idev = dev_get_drvdata(dev); + percpu_ref_exit(&idev->info_ref); kfree(idev); } +static void uio_info_free(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct uio_device, info_ref); + + complete(&idev->free_done); +} + + /** * __uio_register_device - register a new userspace IO device * @owner: module that creates the new device @@ -1004,12 +1058,21 @@ int __uio_register_device(struct module *owner, idev->owner = owner; idev->info = info; - mutex_init(&idev->info_lock); init_waitqueue_head(&idev->wait); atomic_set(&idev->event, 0); + ret = percpu_ref_init(&idev->info_ref, uio_info_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + kfree(idev); + return ret; + } + init_completion(&idev->confirm_done); + init_completion(&idev->free_done); + ret = uio_get_minor(idev); if (ret) { + percpu_ref_exit(&idev->info_ref); kfree(idev); return ret; } @@ -1103,6 +1166,13 @@ int __devm_uio_register_device(struct module *owner, } EXPORT_SYMBOL_GPL(__devm_uio_register_device); +static void uio_confirm_info(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct 
uio_device, info_ref); + + complete(&idev->confirm_done); +} + /** * uio_unregister_device - unregister a industrial IO device * @info: UIO device capabilities @@ -1119,14 +1189,16 @@ void uio_unregister_device(struct uio_info *info) idev = info->uio_dev; minor = idev->minor; - mutex_lock(&idev->info_lock); + percpu_ref_kill_and_confirm(&idev->info_ref, uio_confirm_info); + wait_for_completion(&idev->confirm_done); + wait_for_completion(&idev->free_done); + /* now, we can set info to NULL */ uio_dev_del_attributes(idev); if (info->irq && info->irq != UIO_IRQ_CUSTOM) free_irq(info->irq, idev); idev->info = NULL; - mutex_unlock(&idev->info_lock); wake_up_interruptible(&idev->wait); kill_fasync(&idev->async_queue, SIGIO, POLL_HUP); diff --git a/include/linux/bio.h b/include/linux/bio.h index 92da13d3fdea14d06508ffce0204d4b5d78ea08f..a80db0464744cd9dea719037cd7c2344673ff4df 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -11,7 +11,15 @@ #include #include +#ifdef CONFIG_THP_SWAP +#if HPAGE_PMD_NR > 256 +#define BIO_MAX_VECS (HPAGE_PMD_NR * 1U) +#else #define BIO_MAX_VECS 256U +#endif +#else +#define BIO_MAX_VECS 256U +#endif #define BIO_MAX_INLINE_VECS UIO_MAXIOV struct queue_limits; diff --git a/include/linux/mm.h b/include/linux/mm.h index 95ad2c61151ce723b774561435d0155830235059..b78c7b4951f9b8fcc1b20f62dc9955bd5c3c1d28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4512,6 +4512,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page); +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); int map_kernel_pages_prepare(struct vm_area_desc *desc); int map_kernel_pages_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 02eaac47ac444a1b12654fcbc42fb1cde8a7be6b..4ee9b76ae8e1c04fc1473f93bfcc0abe18ea8cac 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -16,6 +16,7 @@ #include #include #include +#include struct module; struct uio_map; @@ -81,9 +82,11 @@ struct uio_device { struct fasync_struct *async_queue; wait_queue_head_t wait; struct uio_info *info; - struct mutex info_lock; struct kobject *map_dir; struct kobject *portio_dir; + struct percpu_ref info_ref; + struct completion confirm_done; + struct completion free_done; }; /** @@ -116,6 +119,7 @@ struct uio_info { int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); + long (*ioctl)(struct uio_info *info, unsigned int cmd, unsigned long arg); }; extern int __must_check diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index f925a77f19ed1b42ab0626b1b897aefe1328b25d..8931c2bb0afe34263f4438c4dc7385cf45fd3c73 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -73,6 +73,8 @@ enum tcmu_opcode { struct tcmu_cmd_entry_hdr { __u32 len_op; __u16 cmd_id; +#define TCMU_KFLAG_ZERO_COPY 0x1 +#define TCMU_KFLAG_BYPASS_DATA_AREA 0x2 __u8 kflags; #define TCMU_UFLAG_UNKNOWN_OP 0x1 #define TCMU_UFLAG_READ_LEN 0x2 @@ -185,4 +187,21 @@ enum tcmu_genl_attr 
{ }; #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) +struct tcmu_data_xfer { + __u16 cmd_id; + __u16 __pad1; + __u32 iov_cnt; + struct iovec __user *iovec; +}; + +#define TCMU_IOCTL_CMD_COPY_TO_SGL _IOW('T', 0xe0, struct tcmu_data_xfer) +#define TCMU_IOCTL_CMD_COPY_FROM_SGL _IOR('T', 0xe1, struct tcmu_data_xfer) +#define TCMU_IOCTL_CMD_ZEROCOPY _IOW('T', 0xe2, struct tcmu_cmd_zerocopy) + +struct tcmu_cmd_zerocopy { + struct iovec __user *iov; + __u32 iov_cnt; + __u16 cmd_id; +}; + #endif diff --git a/mm/memory.c b/mm/memory.c index 821a585fccc07b12e084266ce9af09b14cf5675a..95bc5c10d0edce7944146da97c8d0d0c14aead0f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2231,6 +2231,7 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, zap_vma_range_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } +EXPORT_SYMBOL_GPL(zap_vma_range); /** * zap_special_vma_range - zap all page table entries in a special vma range @@ -3291,6 +3292,176 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL +static int insert_page_into_pte_locked_mkspecial(struct mm_struct *mm, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + /* + * The page to be inserted should be either anonymous page or file page. + * + * In general, the anonymous page used in dio should be pinned, while + * the file page used in buffer IO is either locked (read) or writeback + * (sync). On the other hand, file page used in IO metadata read (e.g., + * ext4_get_inode_loc) can be unlocked, and the buffer_head is locked + * instead. + * + * Finally, it is the caller's responsibility to ensure the validity of + * pages to be inserted, i.e., such pages are used for IO requests. + */ + if (!PageAnon(page) && !folio_is_file_lru(page_folio(page))) + return -EINVAL; + + flush_dcache_page(page); + + if (!pte_none(*pte)) + return -EBUSY; + set_pte_at(mm, addr, pte, pte_mkspecial(mk_pte(page, prot))); + return 0; +} + +static int insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = insert_page_into_pte_locked_mkspecial(mm, pte, addr, page, prot); + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(mmap_read_trylock(vma->vm_mm)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vm_flags_set(vma, VM_MIXEDMAP); + } + return insert_page_mkspecial(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page_mkspecial); + +#ifdef pte_index +/* + * insert_pages_mkspecial() amortizes the cost of spinlock operations + * when inserting pages in a loop. Arch *must* define pte_index. 
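+ *
+ * As a rough sense of scale: inserting 256 pages one at a time would
+ * take the PTE lock 256 times, while the 8-entry batches used below
+ * take it about 32 times, once per batch (plus once per PMD crossed).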
+ */ +static int insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num, pgprot_t prot) +{ + pmd_t *pmd = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; + struct mm_struct *const mm = vma->vm_mm; + unsigned long curr_page_idx = 0; + unsigned long remaining_pages_total = *num; + unsigned long pages_to_write_in_pmd; + int ret; +more: + ret = -EFAULT; + pmd = walk_to_pmd(mm, addr); + if (!pmd) + goto out; + + pages_to_write_in_pmd = min_t(unsigned long, + remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); + + /* Allocate the PTE if necessary; takes PMD lock once only. */ + ret = -ENOMEM; + if (pte_alloc(mm, pmd)) + goto out; + + while (pages_to_write_in_pmd) { + int pte_idx = 0; + const int batch_size = min_t(int, pages_to_write_in_pmd, 8); + + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_into_pte_locked_mkspecial(mm, pte, + addr, pages[curr_page_idx], prot); + if (unlikely(err)) { + pte_unmap_unlock(start_pte, pte_lock); + ret = err; + remaining_pages_total -= pte_idx; + goto out; + } + addr += PAGE_SIZE; + ++curr_page_idx; + } + pte_unmap_unlock(start_pte, pte_lock); + pages_to_write_in_pmd -= batch_size; + remaining_pages_total -= batch_size; + } + if (remaining_pages_total) + goto more; + ret = 0; +out: + *num = remaining_pages_total; + return ret; +} +#endif /* pte_index */ + +/* + * vm_insert_pages_mkspecial - variant of vm_insert_pages using insert_pfn. + * + * The main purpose of vm_insert_pages_mkspecial is to combine the advantages of + * vm_insert_pages (batching the pmd lock) and remap_pfn_range_notrack (skipping + * track_pfn_insert). + * + * The caller should ensure the isolation (refcounted, PG_locked, PG_writeback, etc.) + * of @pages, and account for error case where a subset of @pages are mapped. + */ +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ +#ifdef pte_index + const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + + if (addr < vma->vm_start || end_addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(mmap_read_trylock(vma->vm_mm)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vm_flags_set(vma, VM_MIXEDMAP); + } + return insert_pages_mkspecial(vma, addr, pages, num, vma->vm_page_prot); +#else + unsigned long idx = 0, pgcount = *num; + int err = -EINVAL; + + for (; idx < pgcount; ++idx) { + err = vm_insert_page_mkspecial(vma, addr + (PAGE_SIZE * idx), pages[idx]); + if (err) + break; + } + *num = pgcount - idx; + return err; +#endif /* pte_index */ +} +EXPORT_SYMBOL(vm_insert_pages_mkspecial); +#else +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page_mkspecial); +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages_mkspecial); +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create,