aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2024-01-09 09:07:42 +1000
committerDave Airlie <airlied@redhat.com>2024-01-09 09:07:50 +1000
commite54478fbdad20f2c58d0a4f99d01299ed8e7fe9c (patch)
treea0e9b35d0cc755a26c6e6b2ae7aca864cc7dcad7 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent3c064aea46d071ccf95a142be5532768a7fa6f02 (diff)
parent754d349ed41186e3aba50c3128937be335f9460a (diff)
downloadlinux-e54478fbdad20f2c58d0a4f99d01299ed8e7fe9c.tar.gz
linux-e54478fbdad20f2c58d0a4f99d01299ed8e7fe9c.tar.bz2
linux-e54478fbdad20f2c58d0a4f99d01299ed8e7fe9c.zip
Merge tag 'amd-drm-next-6.8-2024-01-05' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.8-2024-01-05: amdgpu: - VRR fixes - PSR-SU fixes - SubVP fixes - DCN 3.5 fixes - Documentation updates - DMCUB fixes - DML2 fixes - UMC 12.0 updates - GPUVM fix - Misc code cleanups and whitespace cleanups - DP MST fix - Let KFD sync with GPUVM fences - GFX11 reset fix - SMU 13.0.6 fixes - VSC fix for DP/eDP - Navi12 display fix - RN/CZN system aperture fix - DCN 2.1 bandwidth validation fix - DCN INIT cleanup amdkfd: - SVM fixes - Revert TBA/TMA location change Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240105220522.4976-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c34
1 files changed, 22 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bacb59d8b701..fc42fb6ee191 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
- amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
- amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+ amdgpu_ras_error_statistic_ce_count(&obj->err_data,
+ &err_info->mcm_info, NULL, err_info->ce_count);
+ amdgpu_ras_error_statistic_ue_count(&obj->err_data,
+ &err_info->mcm_info, NULL, err_info->ue_count);
}
} else {
/* for legacy asic path which doesn't has error source info */
@@ -1174,6 +1176,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
+ if (blk == AMDGPU_RAS_BLOCK_COUNT)
+ return -EINVAL;
+
if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
return -EINVAL;
@@ -2538,7 +2543,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
return 0;
data = &con->eh_data;
- *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+ *data = kzalloc(sizeof(**data), GFP_KERNEL);
if (!*data) {
ret = -ENOMEM;
goto out;
@@ -2825,10 +2830,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (con)
return 0;
- con = kmalloc(sizeof(struct amdgpu_ras) +
+ con = kzalloc(sizeof(*con) +
sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
- GFP_KERNEL|__GFP_ZERO);
+ GFP_KERNEL);
if (!con)
return -ENOMEM;
@@ -3133,8 +3138,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
- /* enable MCA debug on APU device */
- amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
+ amdgpu_ras_set_mca_debug_mode(adev, false);
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
@@ -3691,7 +3695,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
}
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr)
{
struct ras_err_node *err_node;
@@ -3705,6 +3710,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+ if (err_addr)
+ memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+
err_data->err_list_count++;
list_add_tail(&err_node->node, &err_data->err_node_list);
list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3721,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
}
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr, u64 count)
{
struct ras_err_info *err_info;
@@ -3723,7 +3732,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
if (!count)
return 0;
- err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
if (!err_info)
return -EINVAL;
@@ -3734,7 +3743,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
}
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr, u64 count)
{
struct ras_err_info *err_info;
@@ -3744,7 +3754,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
if (!count)
return 0;
- err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
if (!err_info)
return -EINVAL;