From baaeb610b17a3419fa31a5c70a2bb84528a6eaa7 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 13 Nov 2019 22:24:12 +0800 Subject: drm/amdgpu: enable ras capablity check on arcturus check hw ras capablity via atomfirmware Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 404483437bd3..1593564578b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1687,7 +1687,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, *supported = 0; if (amdgpu_sriov_vf(adev) || - adev->asic_type != CHIP_VEGA20) + (adev->asic_type != CHIP_VEGA20 && + adev->asic_type != CHIP_ARCTURUS)) return; if (adev->is_atom_fw && -- cgit From f2a79be1c094f4d664b6a3fbfd9b5a61f8ff7f02 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Mon, 25 Nov 2019 12:26:09 +0800 Subject: drm/amdgpu: export amdgpu_ras_find_obj to use externally Change it to external interface. Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1593564578b0..04394c45aa03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -198,9 +198,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } -static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, - struct ras_common_if *head); - /** * DOC: AMDGPU RAS debugfs control interface * @@ -445,7 +442,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, } /* return an obj equal to head, or the first when head is NULL */ -static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, +struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, struct ras_common_if *head) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -- cgit From 619346240932ac86a0f0c42f887827ae759eda47 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 13 Dec 2019 16:46:05 +0800 Subject: drm/amdgpu: drop useless BACO arg in amdgpu_ras_reset_gpu BACO reset mode strategy is determined by latter func when calling amdgpu_ras_reset_gpu. So not to confuse audience, drop it. Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 04394c45aa03..d5346dc2523a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1870,7 +1870,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) * See feature_enable_on_boot */ amdgpu_ras_disable_all_features(adev, 1); - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev); } } @@ -1933,6 +1933,6 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); - amdgpu_ras_reset_gpu(adev, false); + amdgpu_ras_reset_gpu(adev); } } -- cgit From 374bf7bd6ae43ea7f78b6ce254114adb9463526f Mon Sep 17 00:00:00 2001 From: zhengbin Date: Sat, 14 Dec 2019 17:02:24 +0800 Subject: drm/amdgpu: Remove unneeded semicolon in amdgpu_ras.c Fixes coccicheck warning: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:318:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d5346dc2523a..ad593d1c2576 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -315,7 +315,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * default: ret = -EINVAL; break; - }; + } if (ret) return -EINVAL; -- cgit From 46cf2fecf5971b4019c6565c2a895a77f574eb9b Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 23 Dec 2019 11:40:13 +0800 Subject: drm/amdgpu: add missed return value set for error case Return value should be set when going to error handle tag for error case, this can avoid potential invalid array access by upper caller. Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ad593d1c2576..96fc538ec824 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1311,6 +1311,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, data = con->eh_data; if (!data || data->count == 0) { *bps = NULL; + ret = -EINVAL; goto out; } -- cgit From 3e81ee9a78e5df7df46329e0dcfa751b59573bb7 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 9 Jan 2020 00:48:46 +0800 Subject: drm/amdgpu: support error reporting for sdma ip block invoke sdma query_ras_error_count to get sdma single bit error count Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 96fc538ec824..991c4eaac244 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -686,6 +686,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; + int i; if (!obj) return -EINVAL; @@ -700,6 +701,13 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, if (adev->umc.funcs->query_ras_error_address) adev->umc.funcs->query_ras_error_address(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__SDMA: + if (adev->sdma.funcs->query_ras_error_count) { + for (i = 0; i < adev->sdma.num_instances; i++) + adev->sdma.funcs->query_ras_error_count(adev, i, + &err_data); + } + break; case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_count) adev->gfx.funcs->query_ras_error_count(adev, &err_data); -- cgit From 93af20f74e8eb4077fecdcc6e8093f13f0059bc9 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 16 Jan 2020 12:39:15 +0800 Subject: drm/amdgpu: check if driver should try recovery in ras recovery path To allow the flexibilty for user to disable gpu recovery in RAS recovery path by module parameter amdgpu_gpu_recovery Signed-off-by: Hawking Zhang Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 991c4eaac244..766be7f18282 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1353,7 +1353,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_ras *ras = container_of(work, struct amdgpu_ras, recovery_work); - amdgpu_device_gpu_recover(ras->adev, 0); + if (amdgpu_device_should_recover_gpu(ras->adev)) + amdgpu_device_gpu_recover(ras->adev, 0); atomic_set(&ras->in_recovery, 0); } -- cgit