summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd/kfd_process.c
diff options
context:
space:
mode:
authorYong Zhao <yong.zhao@amd.com>2017-09-20 18:10:14 -0400
committerOded Gabbay <oded.gabbay@gmail.com>2017-09-20 18:10:14 -0400
commit733fa1f7428c362b17b3de3a1c691e21fa803239 (patch)
treed106c889b11b3910a0fad79f677a42424b81b209 /drivers/gpu/drm/amd/amdkfd/kfd_process.c
parentb8935a7c4b4f2c487c639eb9071b4e8f0cbeac4a (diff)
downloadtalos-op-linux-733fa1f7428c362b17b3de3a1c691e21fa803239.tar.gz
talos-op-linux-733fa1f7428c362b17b3de3a1c691e21fa803239.zip
drm/amdkfd: Fix suspend/resume issue on Carrizo v2
When we do suspend/resume through "sudo pm-suspend" while there is HSA activity running, upon resume we will encounter HWS hanging, which is caused by memory read/write failures. The root cause is that when suspend, we neglected to unbind pasid from kfd device. Another major change is that the bind/unbinding is changed to be performed on a per process basis, instead of whether there are queues in dqm. v2: - free IOMMU device if kfd_bind_processes_to_device fails in kfd_resume - add comments to kfd_bind/unbind_processes_to/from_device - minor cleanups Signed-off-by: Yong Zhao <yong.zhao@amd.com> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c97
1 files changed, 81 insertions, 16 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 9e65ce3c1967..1325f88591ae 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -174,9 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
if (pdd->reset_wavefronts)
dbgdev_wave_reset_wavefronts(pdd->dev, p);
- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
- list_del(&pdd->per_device_list);
+ if (pdd->bound == PDD_BOUND)
+ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+ list_del(&pdd->per_device_list);
kfree(pdd);
}
@@ -351,9 +352,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
list_for_each_entry(pdd, &p->per_device_data, per_device_list)
if (pdd->dev == dev)
- break;
+ return pdd;
- return pdd;
+ return NULL;
}
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -368,6 +369,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
pdd->qpd.dqm = dev->dqm;
pdd->reset_wavefronts = false;
+ pdd->bound = PDD_UNBOUND;
list_add(&pdd->per_device_list, &p->per_device_data);
}
@@ -393,19 +395,91 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
return ERR_PTR(-ENOMEM);
}
- if (pdd->bound)
+ if (pdd->bound == PDD_BOUND) {
return pdd;
+ } else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
+ pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+ return ERR_PTR(-EINVAL);
+ }
err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
if (err < 0)
return ERR_PTR(err);
- pdd->bound = true;
+ pdd->bound = PDD_BOUND;
return pdd;
}
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+/*
+ * Bind processes do the device that have been temporarily unbound
+ * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device.
+ */
+int kfd_bind_processes_to_device(struct kfd_dev *dev)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p;
+ unsigned int temp;
+ int err = 0;
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ mutex_lock(&p->mutex);
+ pdd = kfd_get_process_device_data(dev, p);
+ if (pdd->bound != PDD_BOUND_SUSPENDED) {
+ mutex_unlock(&p->mutex);
+ continue;
+ }
+
+ err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+ p->lead_thread);
+ if (err < 0) {
+ pr_err("unexpected pasid %d binding failure\n",
+ p->pasid);
+ mutex_unlock(&p->mutex);
+ break;
+ }
+
+ pdd->bound = PDD_BOUND;
+ mutex_unlock(&p->mutex);
+ }
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ return err;
+}
+
+/*
+ * Temporarily unbind currently bound processes from the device and
+ * mark them as PDD_BOUND_SUSPENDED. These processes will be restored
+ * to PDD_BOUND state in kfd_bind_processes_to_device.
+ */
+void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p;
+ unsigned int temp, temp_bound, temp_pasid;
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ mutex_lock(&p->mutex);
+ pdd = kfd_get_process_device_data(dev, p);
+ temp_bound = pdd->bound;
+ temp_pasid = p->pasid;
+ if (pdd->bound == PDD_BOUND)
+ pdd->bound = PDD_BOUND_SUSPENDED;
+ mutex_unlock(&p->mutex);
+
+ if (temp_bound == PDD_BOUND)
+ amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
+ }
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
{
struct kfd_process *p;
struct kfd_process_device *pdd;
@@ -438,15 +512,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
pdd->reset_wavefronts = false;
}
- /*
- * Just mark pdd as unbound, because we still need it
- * to call amd_iommu_unbind_pasid() in when the
- * process exits.
- * We don't call amd_iommu_unbind_pasid() here
- * because the IOMMU called us.
- */
- pdd->bound = false;
-
mutex_unlock(&p->mutex);
}
OpenPOWER on IntegriCloud