diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 206 |
1 files changed, 166 insertions, 40 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index d11eba09eadd..a97af422575a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -25,7 +25,8 @@ #include "amdgpu.h" #include "amdgpu_xgmi.h" #include "amdgpu_smu.h" - +#include "amdgpu_ras.h" +#include "df/df_3_6_offset.h" static DEFINE_MUTEX(xgmi_mutex); @@ -131,9 +132,37 @@ static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, } +#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) +static ssize_t amdgpu_xgmi_show_error(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; + uint64_t fica_out; + unsigned int error_count = 0; + + ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); + ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); -static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); + fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); + if (fica_out != 0x1f) + pr_err("xGMI error counters not enabled!\n"); + + fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); + + if ((fica_out & 0xffff) == 2) + error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); + adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); + + return snprintf(buf, PAGE_SIZE, "%d\n", error_count); +} + + +static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); +static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) @@ -148,6 +177,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, return ret; } + /* Create xgmi error file */ + ret = device_create_file(adev->dev, &dev_attr_xgmi_error); + if (ret) + pr_err("failed to create xgmi_error\n"); + + /* Create sysfs link to hive info folder on the first device */ if (adev != hive->adev) { ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, @@ -226,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo INIT_LIST_HEAD(&tmp->device_list); mutex_init(&tmp->hive_lock); mutex_init(&tmp->reset_lock); + task_barrier_init(&tmp->tb); if (lock) mutex_lock(&tmp->hive_lock); @@ -239,22 +275,49 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) { int ret = 0; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_device *tmp_adev; + bool update_hive_pstate = true; + bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20; if (!hive) return 0; - if (hive->pstate == pstate) - return 0; + mutex_lock(&hive->hive_lock); + + if (hive->pstate == pstate) { + adev->pstate = is_high_pstate ? pstate : adev->pstate; + goto out; + } dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); - if (is_support_sw_smu(adev)) - ret = smu_set_xgmi_pstate(&adev->smu, pstate); - if (ret) + ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate); + if (ret) { dev_err(adev->dev, "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id, ret); + goto out; + } + + /* Update device pstate */ + adev->pstate = pstate; + + /* + * Update the hive pstate only all devices of the hive + * are in the same pstate + */ + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + if (tmp_adev->pstate != adev->pstate) { + update_hive_pstate = false; + break; + } + } + if (update_hive_pstate || is_high_pstate) + hive->pstate = pstate; + +out: + mutex_unlock(&hive->hive_lock); return ret; } @@ -296,23 +359,28 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) struct amdgpu_xgmi *entry; struct amdgpu_device *tmp_adev = NULL; - int count = 0, ret = -EINVAL; + int count = 0, ret = 0; if (!adev->gmc.xgmi.supported) return 0; - ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); - if (ret) { - dev_err(adev->dev, - "XGMI: Failed to get node id\n"); - return ret; - } + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); + if (ret) { + dev_err(adev->dev, + "XGMI: Failed to get hive id\n"); + return ret; + } - ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); - if (ret) { - dev_err(adev->dev, - "XGMI: Failed to get hive id\n"); - return ret; + ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); + if (ret) { + dev_err(adev->dev, + "XGMI: Failed to get node id\n"); + return ret; + } + } else { + adev->gmc.xgmi.hive_id = 16; + adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; } hive = amdgpu_get_xgmi_hive(adev, 1); @@ -324,6 +392,9 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) goto exit; } + /* Set default device pstate */ + adev->pstate = -1; + top_info = &adev->psp.xgmi_context.top_info; list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); @@ -332,29 +403,34 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) top_info->num_nodes = count; hive->number_devices = count; - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - /* update node list for other device in the hive */ - if (tmp_adev != adev) { - top_info = &tmp_adev->psp.xgmi_context.top_info; - top_info->nodes[count - 1].node_id = adev->gmc.xgmi.node_id; - top_info->num_nodes = count; + task_barrier_add_task(&hive->tb); + + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + /* update node list for other device in the hive */ + if (tmp_adev != adev) { + top_info = &tmp_adev->psp.xgmi_context.top_info; + top_info->nodes[count - 1].node_id = + adev->gmc.xgmi.node_id; + top_info->num_nodes = count; + } + ret = amdgpu_xgmi_update_topology(hive, tmp_adev); + if (ret) + goto exit; } - ret = amdgpu_xgmi_update_topology(hive, tmp_adev); - if (ret) - goto exit; - } - /* get latest topology info for each device from psp */ - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, - &tmp_adev->psp.xgmi_context.top_info); - if (ret) { - dev_err(tmp_adev->dev, - "XGMI: Get topology failure on device %llx, hive %llx, ret %d", - tmp_adev->gmc.xgmi.node_id, - tmp_adev->gmc.xgmi.hive_id, ret); - /* To do : continue with some node failed or disable the whole hive */ - goto exit; + /* get latest topology info for each device from psp */ + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, + &tmp_adev->psp.xgmi_context.top_info); + if (ret) { + dev_err(tmp_adev->dev, + "XGMI: Get topology failure on device %llx, hive %llx, ret %d", + tmp_adev->gmc.xgmi.node_id, + tmp_adev->gmc.xgmi.hive_id, ret); + /* To do : continue with some node failed or disable the whole hive */ + goto exit; + } } } @@ -391,7 +467,57 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->reset_lock); } else { + task_barrier_rem_task(&hive->tb); amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); mutex_unlock(&hive->hive_lock); } } + +int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) +{ + int r; + struct ras_ih_if ih_info = { + .cb = NULL, + }; + struct ras_fs_if fs_info = { + .sysfs_name = "xgmi_wafl_err_count", + .debugfs_name = "xgmi_wafl_err_inject", + }; + + if (!adev->gmc.xgmi.supported || + adev->gmc.xgmi.num_physical_nodes == 0) + return 0; + + if (!adev->gmc.xgmi.ras_if) { + adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); + if (!adev->gmc.xgmi.ras_if) + return -ENOMEM; + adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; + adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; + adev->gmc.xgmi.ras_if->sub_block_index = 0; + strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl"); + } + ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; + r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, + &fs_info, &ih_info); + if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { + kfree(adev->gmc.xgmi.ras_if); + adev->gmc.xgmi.ras_if = NULL; + } + + return r; +} + +void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) +{ + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && + adev->gmc.xgmi.ras_if) { + struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; + struct ras_ih_if ih_info = { + .cb = NULL, + }; + + amdgpu_ras_late_fini(adev, ras_if, &ih_info); + kfree(ras_if); + } +} |