diff options
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r-- | drivers/infiniband/core/Makefile | 1 | ||||
-rw-r--r-- | drivers/infiniband/core/addr.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/core/multicast.c | 11 | ||||
-rw-r--r-- | drivers/infiniband/core/umem.c | 72 | ||||
-rw-r--r-- | drivers/infiniband/core/umem_odp.c | 668 | ||||
-rw-r--r-- | drivers/infiniband/core/umem_rbtree.c | 94 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs.h | 1 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs_cmd.c | 171 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs_main.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/core/verbs.c | 3 |
10 files changed, 976 insertions, 54 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index ffd0af6734af..acf736764445 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o +ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o ib_mad-y := mad.o smi.o agent.o mad_rmpp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 8172d37f9add..f80da50d84a5 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -176,8 +176,8 @@ static void set_timeout(unsigned long time) unsigned long delay; delay = time - jiffies; - if ((long)delay <= 0) - delay = 1; + if ((long)delay < 0) + delay = 0; mod_delayed_work(addr_wq, &work, delay); } diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d2360a8ef0b2..fa17b552ff78 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec, if (status) process_join_error(group, status); else { + int mgids_changed, is_mgid0; ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); - group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; - if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { rb_erase(&group->node, &group->port->table); - mcast_insert(group->port, group, 1); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index df0c4f605a21..aec7a6aa2951 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -39,6 +39,7 @@ #include <linux/hugetlb.h> #include <linux/dma-attrs.h> #include <linux/slab.h> +#include <rdma/ib_umem_odp.h> #include "uverbs.h" @@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d /** * ib_umem_get - Pin and DMA map userspace memory. + * + * If access flags indicate ODP memory, avoid pinning. Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * * @context: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->context = context; umem->length = size; - umem->offset = addr & ~PAGE_MASK; + umem->address = addr; umem->page_size = PAGE_SIZE; umem->pid = get_task_pid(current, PIDTYPE_PID); /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + + if (access & IB_ACCESS_ON_DEMAND) { + ret = ib_umem_odp_get(context, umem); + if (ret) { + kfree(umem); + return ERR_PTR(ret); + } + return umem; + } + + umem->odp_data = NULL; /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; @@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!vma_list) umem->hugetlb = 0; - npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; + npages = ib_umem_num_pages(umem); down_write(¤t->mm->mmap_sem); @@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem) struct task_struct *task; unsigned long diff; + if (umem->odp_data) { + ib_umem_odp_release(umem); + return; + } + __ib_umem_release(umem->context->device, umem, 1); task = get_pid_task(umem->pid, PIDTYPE_PID); @@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem) if (!mm) goto out; - diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; + diff = ib_umem_num_pages(umem); /* * We may be called with the mm's mmap_sem already held. This @@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; + if (umem->odp_data) + return ib_umem_num_pages(umem); + shift = ilog2(umem->page_size); n = 0; @@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem) return n; } EXPORT_SYMBOL(ib_umem_page_count); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", + offset, umem->length, end); + return -EINVAL; + } + + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + offset + ib_umem_offset(umem)); + + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c new file mode 100644 index 000000000000..6095872549e7 --- /dev/null +++ b/drivers/infiniband/core/umem_odp.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/pid.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/vmalloc.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> + +static void ib_umem_notifier_start_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + int notifiers_count = item->odp_data->notifiers_count++; + + if (notifiers_count == 0) + /* Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one + * should be waiting right now. */ + reinit_completion(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +static void ib_umem_notifier_end_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + /* + * This sequence increase will notify the QP page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + ++item->odp_data->notifiers_seq; + if (--item->odp_data->notifiers_count == 0) + complete_all(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +/* Account for a new mmu notifier in an ib_ucontext. */ +static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +{ + atomic_inc(&context->notifier_count); +} + +/* Account for a terminating mmu notifier in an ib_ucontext. + * + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since + * the function takes the semaphore itself. */ +static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +{ + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + + if (zero_notifiers && + !list_empty(&context->no_private_counters)) { + /* No currently running mmu notifiers. Now is the chance to + * add private accounting to all previously added umems. */ + struct ib_umem_odp *odp_data, *next; + + /* Prevent concurrent mmu notifiers from working on the + * no_private_counters list. */ + down_write(&context->umem_rwsem); + + /* Read the notifier_count again, with the umem_rwsem + * semaphore taken for write. */ + if (!atomic_read(&context->notifier_count)) { + list_for_each_entry_safe(odp_data, next, + &context->no_private_counters, + no_private_counters) { + mutex_lock(&odp_data->umem_mutex); + odp_data->mn_counters_active = true; + list_del(&odp_data->no_private_counters); + complete_all(&odp_data->notifier_completion); + mutex_unlock(&odp_data->umem_mutex); + } + } + + up_write(&context->umem_rwsem); + } +} + +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) { + /* + * Increase the number of notifiers running, to + * prevent any further fault handling on this MR. + */ + ib_umem_notifier_start_account(item); + item->odp_data->dying = 1; + /* Make sure that the fact the umem is dying is out before we release + * all pending page faults. */ + smp_wmb(); + complete_all(&item->odp_data->notifier_completion); + item->context->invalidate_range(item, ib_umem_start(item), + ib_umem_end(item)); + return 0; +} + +static void ib_umem_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ULLONG_MAX, + ib_umem_notifier_release_trampoline, + NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_page_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, start + PAGE_SIZE); + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, + address + PAGE_SIZE, + invalidate_page_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, end); + return 0; +} + +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_start_trampoline, NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_end_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_page = ib_umem_notifier_invalidate_page, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, +}; + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +{ + int ret_val; + struct pid *our_pid; + struct mm_struct *mm = get_task_mm(current); + + if (!mm) + return -EINVAL; + + /* Prevent creating ODP MRs in child processes */ + rcu_read_lock(); + our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + put_pid(our_pid); + if (context->tgid != our_pid) { + ret_val = -EINVAL; + goto out_mm; + } + + umem->hugetlb = 0; + umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); + if (!umem->odp_data) { + ret_val = -ENOMEM; + goto out_mm; + } + umem->odp_data->umem = umem; + + mutex_init(&umem->odp_data->umem_mutex); + + init_completion(&umem->odp_data->notifier_completion); + + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->page_list)); + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } + + umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->dma_list)); + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } + + /* + * When using MMU notifiers, we will get a + * notification before the "current" task (and MM) is + * destroyed. We use the umem_rwsem semaphore to synchronize. + */ + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem->odp_data->interval_tree, + &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count))) + umem->odp_data->mn_counters_active = true; + else + list_add(&umem->odp_data->no_private_counters, + &context->no_private_counters); + downgrade_write(&context->umem_rwsem); + + if (context->odp_mrs_count == 1) { + /* + * Note that at this point, no MMU notifier is running + * for this context! + */ + atomic_set(&context->notifier_count, 0); + INIT_HLIST_NODE(&context->mn.hlist); + context->mn.ops = &ib_umem_notifiers; + /* + * Lock-dep detects a false positive for mmap_sem vs. + * umem_rwsem, due to not grasping downgrade_write correctly. + */ + lockdep_off(); + ret_val = mmu_notifier_register(&context->mn, mm); + lockdep_on(); + if (ret_val) { + pr_err("Failed to register mmu_notifier %d\n", ret_val); + ret_val = -EBUSY; + goto out_mutex; + } + } + + up_read(&context->umem_rwsem); + + /* + * Note that doing an mmput can cause a notifier for the relevant mm. + * If the notifier is called while we hold the umem_rwsem, this will + * cause a deadlock. Therefore, we release the reference only after we + * released the semaphore. + */ + mmput(mm); + return 0; + +out_mutex: + up_read(&context->umem_rwsem); + vfree(umem->odp_data->dma_list); +out_page_list: + vfree(umem->odp_data->page_list); +out_odp_data: + kfree(umem->odp_data); +out_mm: + mmput(mm); + return ret_val; +} + +void ib_umem_odp_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + + /* + * Ensure that no more pages are mapped in the umem. + * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_end(umem)); + + down_write(&context->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem->odp_data->interval_tree, + &context->umem_tree); + context->odp_mrs_count--; + if (!umem->odp_data->mn_counters_active) { + list_del(&umem->odp_data->no_private_counters); + complete_all(&umem->odp_data->notifier_completion); + } + + /* + * Downgrade the lock to a read lock. This ensures that the notifiers + * (who lock the mutex for reading) will be able to finish, and we + * will be able to enventually obtain the mmu notifiers SRCU. Note + * that since we are doing it atomically, no other user could register + * and unregister while we do the check. + */ + downgrade_write(&context->umem_rwsem); + if (!context->odp_mrs_count) { + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(context->tgid, + PIDTYPE_PID); + if (owning_process == NULL) + /* + * The process is already dead, notifier were removed + * already. + */ + goto out; + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) + /* + * The process' mm is already dead, notifier were + * removed already. + */ + goto out_put_task; + mmu_notifier_unregister(&context->mn, owning_mm); + + mmput(owning_mm); + +out_put_task: + put_task_struct(owning_process); + } +out: + up_read(&context->umem_rwsem); + + vfree(umem->odp_data->dma_list); + vfree(umem->odp_data->page_list); + kfree(umem->odp_data); + kfree(umem); +} + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page to. + * @page_index: index in the umem to add the page to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. + * @current_seq: sequence number for synchronization with invalidations. + * the sequence number is taken from + * umem->odp_data->notifiers_seq. + * + * The function returns -EFAULT if the DMA mapping operation fails. It returns + * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * + * The page is released via put_page even if the operation failed. For + * on-demand pinning, the page is released whenever it isn't stored in the + * umem. + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem *umem, + int page_index, + u64 base_virt_addr, + struct page *page, + u64 access_mask, + unsigned long current_seq) +{ + struct ib_device *dev = umem->context->device; + dma_addr_t dma_addr; + int stored_page = 0; + int remove_existing_mapping = 0; + int ret = 0; + + mutex_lock(&umem->odp_data->umem_mutex); + /* + * Note: we avoid writing if seq is different from the initial seq, to + * handle case of a racing notifier. This check also allows us to bail + * early if we have a notifier running in parallel with us. + */ + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + ret = -EAGAIN; + goto out; + } + if (!(umem->odp_data->dma_list[page_index])) { + dma_addr = ib_dma_map_page(dev, + page, + 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, dma_addr)) { + ret = -EFAULT; + goto out; + } + umem->odp_data->dma_list[page_index] = dma_addr | access_mask; + umem->odp_data->page_list[page_index] = page; + stored_page = 1; + } else if (umem->odp_data->page_list[page_index] == page) { + umem->odp_data->dma_list[page_index] |= access_mask; + } else { + pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem->odp_data->page_list[page_index], page); + /* Better remove the mapping now, to prevent any further + * damage. */ + remove_existing_mapping = 1; + } + +out: + mutex_unlock(&umem->odp_data->umem_mutex); + + /* On Demand Paging - avoid pinning the page */ + if (umem->context->invalidate_range || !stored_page) + put_page(page); + + if (remove_existing_mapping && umem->context->invalidate_range) { + invalidate_page_trampoline( + umem, + base_virt_addr + (page_index * PAGE_SIZE), + base_virt_addr + ((page_index+1)*PAGE_SIZE), + NULL); + ret = -EAGAIN; + } + + return ret; +} + +/** + * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * + * Pins the range of pages passed in the argument, and maps them to + * DMA addresses. The DMA addresses of the mapped pages is updated in + * umem->odp_data->dma_list. + * + * Returns the number of pages mapped in success, negative error code + * for failure. + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents + * the function from completing its task. + * + * @umem: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. The actual pages mapped is returned in + * the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @current_seq: the MMU notifiers sequance value for synchronization with + * invalidations. the sequance number is read from + * umem->odp_data->notifiers_seq before calling this function + */ +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, + u64 access_mask, unsigned long current_seq) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + struct page **local_page_list = NULL; + u64 off; + int j, k, ret = 0, start_idx, npages = 0; + u64 base_virt_addr; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem) || + user_virt + bcnt > ib_umem_end(umem)) + return -EFAULT; + + local_page_list = (struct page **)__get_free_page(GFP_KERNEL); + if (!local_page_list) + return -ENOMEM; + + off = user_virt & (~PAGE_MASK); + user_virt = user_virt & PAGE_MASK; + base_virt_addr = user_virt; + bcnt += off; /* Charge for the first page offset as well. */ + + owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (owning_process == NULL) { + ret = -EINVAL; + goto out_no_task; + } + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) { + ret = -EINVAL; + goto out_put_task; + } + + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; + k = start_idx; + + while (bcnt > 0) { + const size_t gup_num_pages = + min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, + PAGE_SIZE / sizeof(struct page *)); + + down_read(&owning_mm->mmap_sem); + /* + * Note: this might result in redundent page getting. We can + * avoid this by checking dma_list to be 0 before calling + * get_user_pages. However, this make the code much more + * complex (and doesn't gain us much performance in most use + * cases). + */ + npages = get_user_pages(owning_process, owning_mm, user_virt, + gup_num_pages, + access_mask & ODP_WRITE_ALLOWED_BIT, 0, + local_page_list, NULL); + up_read(&owning_mm->mmap_sem); + + if (npages < 0) + break; + + bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); + user_virt += npages << PAGE_SHIFT; + for (j = 0; j < npages; ++j) { + ret = ib_umem_odp_map_dma_single_page( + umem, k, base_virt_addr, local_page_list[j], + access_mask, current_seq); + if (ret < 0) + break; + k++; + } + + if (ret < 0) { + /* Release left over pages when handling errors. */ + for (++j; j < npages; ++j) + put_page(local_page_list[j]); + break; + } + } + + if (ret >= 0) { + if (npages < 0 && k == start_idx) + ret = npages; + else + ret = k - start_idx; + } + + mmput(owning_mm); +out_put_task: + put_task_struct(owning_process); +out_no_task: + free_page((unsigned long)local_page_list); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, + u64 bound) +{ + int idx; + u64 addr; + struct ib_device *dev = umem->context->device; + + virt = max_t(u64, virt, ib_umem_start(umem)); + bound = min_t(u64, bound, ib_umem_end(umem)); + /* Note that during the run of this function, the + * notifiers_count of the MR is > 0, preventing any racing + * faults from completion. We might be racing with other + * invalidations, so we must make sure we free each page only + * once. */ + for (addr = virt; addr < bound; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + mutex_lock(&umem->odp_data->umem_mutex); + if (umem->odp_data->page_list[idx]) { + struct page *page = umem->odp_data->page_list[idx]; + struct page *head_page = compound_head(page); + dma_addr_t dma = umem->odp_data->dma_list[idx]; + dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + + WARN_ON(!dma_addr); + + ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); + /* on demand pinning support */ + if (!umem->context->invalidate_range) + put_page(page); + umem->odp_data->page_list[idx] = NULL; + umem->odp_data->dma_list[idx] = 0; + } + mutex_unlock(&umem->odp_data->umem_mutex); + } +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c new file mode 100644 index 000000000000..727d788448f5 --- /dev/null +++ b/drivers/infiniband/core/umem_rbtree.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <rdma/ib_umem_odp.h> + +/* + * The ib_umem list keeps track of memory regions for which the HW + * device request to receive notification when the related memory + * mapping is changed. + * + * ib_umem_lock protects the list. + */ + +static inline u64 node_start(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_start(umem_odp->umem); +} + +/* Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not contained + * in the umem. + */ +static inline u64 node_last(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_end(umem_odp->umem) - 1; +} + +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, + node_start, node_last, , rbt_ib_umem) + +/* @last is not a part of the interval. See comment for function + * node_last. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, + u64 start, u64 last, + umem_call_back cb, + void *cookie) +{ + int ret_val = 0; + struct umem_odp_node *node; + struct ib_umem_odp *umem; + + if (unlikely(start == last)) + return ret_val; + + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; + node = rbt_ib_umem_iter_next(node, start, last - 1)) { + umem = container_of(node, struct ib_umem_odp, interval_tree); + ret_val = cb(umem->umem, start, last, cookie) || ret_val; + } + + return ret_val; +} diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 643c08a025a5..b716b0815644 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5ba2a86aab6a..532d8eba8b02 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -36,6 +36,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/sched.h> #include <asm/uaccess.h> @@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context_resp resp; struct ib_udata udata; struct ib_device *ibdev = file->device->ib_dev; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_device_attr dev_attr; +#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); + rcu_read_lock(); + ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); ucontext->closing = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ucontext->umem_tree = RB_ROOT; + init_rwsem(&ucontext->umem_rwsem); + ucontext->odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->no_private_counters); + + ret = ib_query_device(ibdev, &dev_attr); + if (ret) + goto err_free; + if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; + +#endif + resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); @@ -371,6 +392,7 @@ err_fd: put_unused_fd(resp.async_fd); err_free: + put_pid(ucontext->tgid); ibdev->dealloc_ucontext(ucontext); err: @@ -378,6 +400,52 @@ err: return ret; } +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +} + ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return ret; memset(&resp, 0, sizeof resp); - - resp.fw_ver = attr.fw_ver; - resp.node_guid = file->device->ib_dev->node_guid; - resp.sys_image_guid = attr.sys_image_guid; - resp.max_mr_size = attr.max_mr_size; - resp.page_size_cap = attr.page_size_cap; - resp.vendor_id = attr.vendor_id; - resp.vendor_part_id = attr.vendor_part_id; - resp.hw_ver = attr.hw_ver; - resp.max_qp = attr.max_qp; - resp.max_qp_wr = attr.max_qp_wr; - resp.device_cap_flags = attr.device_cap_flags; - resp.max_sge = attr.max_sge; - resp.max_sge_rd = attr.max_sge_rd; - resp.max_cq = attr.max_cq; - resp.max_cqe = attr.max_cqe; - resp.max_mr = attr.max_mr; - resp.max_pd = attr.max_pd; - resp.max_qp_rd_atom = attr.max_qp_rd_atom; - resp.max_ee_rd_atom = attr.max_ee_rd_atom; - resp.max_res_rd_atom = attr.max_res_rd_atom; - resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; - resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; - resp.atomic_cap = attr.atomic_cap; - resp.max_ee = attr.max_ee; - resp.max_rdd = attr.max_rdd; - resp.max_mw = attr.max_mw; - resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; - resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; - resp.max_mcast_grp = attr.max_mcast_grp; - resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; - resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; - resp.max_ah = attr.max_ah; - resp.max_fmr = attr.max_fmr; - resp.max_map_per_fmr = attr.max_map_per_fmr; - resp.max_srq = attr.max_srq; - resp.max_srq_wr = attr.max_srq_wr; - resp.max_srq_sge = attr.max_srq_sge; - resp.max_pkeys = attr.max_pkeys; - resp.local_ca_ack_delay = attr.local_ca_ack_delay; - resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; + copy_query_dev_fields(file, &resp, &attr); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, goto err_free; } + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + struct ib_device_attr attr; + + ret = ib_query_device(pd->device, &attr); + if (ret || !(attr.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, cmd.access_flags, &udata); if (IS_ERR(mr)) { @@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? ret : in_len; } + +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr; + struct ib_device *device; + int err; + + device = file->device->ib_dev; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.reserved) + return -EINVAL; + + err = device->query_device(device, &attr); + if (err) + return err; + + memset(&resp, 0, sizeof(resp)); + copy_query_dev_fields(file, &resp.base, &attr); + resp.comp_mask = 0; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; + } +#endif + + err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); + if (err) + return err; + + return 0; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 71ab83fde472..e6c23b9eab33 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, - [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device }; static void ib_uverbs_add_one(struct ib_device *device); @@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + put_pid(context->tgid); + return context->device->dealloc_ucontext(context); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c2b89cc5dbca..f93eb8da7b5a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -879,7 +879,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp, if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); - qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + if (!(*qp_attr_mask & IB_QP_VID)) + qp_attr->vlan_id = rdma_get_vlan_id(&sgid); } else { ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, qp_attr->ah_attr.dmac, &qp_attr->vlan_id); |