From b4cfe447d47b5763f630412fd5dc5fbe66e991d1 Mon Sep 17 00:00:00 2001
From: Haggai Eran
Date: Thu, 11 Dec 2014 17:04:26 +0200
Subject: IB/mlx5: Implement on demand paging by adding support for MMU notifiers

* Implement the relevant invalidation functions (zap MTTs as needed)
* Implement interlocking (and rollback in the page fault handlers) for
  cases of a racing notifier and fault.
* With this patch we can now enable the capability bits for supporting RC
  send/receive/RDMA read/RDMA write, and UD send.

Signed-off-by: Sagi Grimberg
Signed-off-by: Shachar Raindel
Signed-off-by: Haggai Eran
Signed-off-by: Roland Dreier
---
 drivers/infiniband/hw/mlx5/odp.c | 128 +++++++++++++++++++++++++++++++++++----
 1 file changed, 117 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 936a6cd4ecc7..a2c541c4809a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -37,8 +37,78 @@
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
 struct workqueue_struct *mlx5_ib_page_fault_wq;
 
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+			      unsigned long end)
+{
+	struct mlx5_ib_mr *mr;
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	u64 idx = 0, blk_start_idx = 0;
+	int in_block = 0;
+	u64 addr;
+
+	if (!umem || !umem->odp_data) {
+		pr_err("invalidation called on NULL umem or non-ODP umem\n");
+		return;
+	}
+
+	mr = umem->odp_data->private;
+
+	if (!mr || !mr->ibmr.pd)
+		return;
+
+	start = max_t(u64, ib_umem_start(umem), start);
+	end = min_t(u64, ib_umem_end(umem), end);
+
+	/*
+	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+	 * while we are doing the invalidation, no page fault will attempt to
+	 * overwrite the same MTTs. Concurent invalidations might race us,
+	 * but they will write 0s as well, so no difference in the end result.
+	 */
+
+	for (addr = start; addr < end; addr += (u64)umem->page_size) {
+		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+		/*
+		 * Strive to write the MTTs in chunks, but avoid overwriting
+		 * non-existing MTTs. The huristic here can be improved to
+		 * estimate the cost of another UMR vs. the cost of bigger
+		 * UMR.
+		 */
+		if (umem->odp_data->dma_list[idx] &
+		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+			if (!in_block) {
+				blk_start_idx = idx;
+				in_block = 1;
+			}
+		} else {
+			u64 umr_offset = idx & umr_block_mask;
+
+			if (in_block && umr_offset == 0) {
+				mlx5_ib_update_mtt(mr, blk_start_idx,
+						   idx - blk_start_idx, 1);
+				in_block = 0;
+			}
+		}
+	}
+	if (in_block)
+		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+				   1);
+
+	/*
+	 * We are now sure that the device will not access the
+	 * memory. We can safely unmap it, and mark it as dirty if
+	 * needed.
+	 */
+
+	ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
 	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
 		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 	if (err)
 		goto out;
 
-	/* At this point we would copy the capability bits that the driver
-	 * supports from the hw_caps struct to the caps struct. However, no
-	 * such capabilities are supported so far. */
+	caps->general_caps = IB_ODP_SUPPORT;
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       RECV);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       WRITE);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       READ);
+
 out:
 	return err;
 }
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
 	u32 base_key = mlx5_base_mkey(key);
 	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+	struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
 
-	if (!mmr || mmr->key != key)
+	if (!mmr || mmr->key != key || !mr->live)
 		return NULL;
 
 	return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
 
 	/*
 	 * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 
 	if (npages > 0) {
 		mutex_lock(&mr->umem->odp_data->umem_mutex);
-		/*
-		 * No need to check whether the MTTs really belong to
-		 * this MR, since ib_umem_odp_map_dma_pages already
-		 * checks this.
-		 */
-		ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+			/*
+			 * No need to check whether the MTTs really belong to
+			 * this MR, since ib_umem_odp_map_dma_pages already
+			 * checks this.
+			 */
+			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		} else {
+			ret = -EAGAIN;
+		}
 		mutex_unlock(&mr->umem->odp_data->umem_mutex);
 		if (ret < 0) {
-			pr_err("Failed to update mkey page tables\n");
+			if (ret != -EAGAIN)
+				pr_err("Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 srcu_unlock:
+	if (ret == -EAGAIN) {
+		if (!mr->umem->odp_data->dying) {
+			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+			unsigned long timeout =
+				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+			if (!wait_for_completion_timeout(
+					&odp_data->notifier_completion,
+					timeout)) {
+				pr_warn("timeout waiting for mmu notifier completion\n");
+			}
+		} else {
+			/* The MR is being killed, kill the QP as well. */
+			ret = -EFAULT;
+		}
+	}
 	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
 	pfault->mpfault.bytes_committed = 0;
 	return ret ? ret : npages;
--
cgit v1.2.1
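
Editor's note (not part of the patch): the interlock above is the classic
MMU-notifier sequence/retry pattern - the fault handler samples a sequence
number before the sleeping page-in work, and only commits translations if no
invalidation ran in between, otherwise it drops the work and retries. Below is
a minimal, self-contained userspace sketch of that pattern; all names
(odp_state, invalidate_range, handle_fault, notifier_retry) are hypothetical
and stand in for the driver's notifiers_seq/notifiers_count/umem_mutex fields.

/* Userspace illustration of the seq/retry interlock; build with -pthread. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct odp_state {
	pthread_mutex_t umem_mutex;    /* serializes translation updates      */
	unsigned long notifiers_seq;   /* bumped by every invalidation        */
	unsigned long notifiers_count; /* invalidations currently in progress */
};

/* Invalidation side: zap translations, then publish a new sequence number. */
static void invalidate_range(struct odp_state *s)
{
	pthread_mutex_lock(&s->umem_mutex);
	s->notifiers_count++;
	/* ... zap the device translations for the range here ... */
	s->notifiers_seq++;
	s->notifiers_count--;
	pthread_mutex_unlock(&s->umem_mutex);
}

/* Called under umem_mutex: true if an invalidation raced and we must retry. */
static bool notifier_retry(struct odp_state *s, unsigned long seq)
{
	return s->notifiers_count || s->notifiers_seq != seq;
}

/* Fault side: sample seq, do the slow work, commit only if still valid. */
static int handle_fault(struct odp_state *s)
{
	for (;;) {
		unsigned long seq = __atomic_load_n(&s->notifiers_seq,
						    __ATOMIC_ACQUIRE);

		/* ... pin and map the pages (the expensive, sleeping part) ... */

		pthread_mutex_lock(&s->umem_mutex);
		if (!notifier_retry(s, seq)) {
			/* ... write the translations to the device ... */
			pthread_mutex_unlock(&s->umem_mutex);
			return 0;
		}
		pthread_mutex_unlock(&s->umem_mutex);
		/* Raced with an invalidation: drop the work and try again. */
	}
}

int main(void)
{
	struct odp_state s = { .umem_mutex = PTHREAD_MUTEX_INITIALIZER };

	invalidate_range(&s);
	printf("fault handled: %d\n", handle_fault(&s));
	return 0;
}

The patch adds the remaining pieces the sketch glosses over: -EAGAIN is
propagated so the caller can wait (bounded by MMU_NOTIFIER_TIMEOUT) on the
notifier completion instead of spinning, and a dying MR turns the retry into
-EFAULT so the QP is torn down rather than faulting forever.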