From f4322b9f8ad5f9f62add288c785d2e10bb6a5efe Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Mon, 22 Apr 2024 14:59:02 -0400 Subject: drm/amdgpu: Fix two reset triggered in a row Some times a hang GPU causes multiple reset sources to schedule resets. The second source will be able to trigger an unnecessary reset if they schedule after we call amdgpu_device_stop_pending_resets. Move amdgpu_device_stop_pending_resets to after the reset is done. Since at this point the GPU is supposedly in a good state, any reset scheduled after this point would be a legitimate reset. Remove unnecessary and incorrect checks for amdgpu_in_reset that was kinda serving this purpose. Signed-off-by: Yunxiang Li Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 59f53c743362..14a065516ae4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); /* only handle FLR_NOTIFY now */ - if (!r && !amdgpu_in_reset(adev)) + if (!r) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", -- cgit From 25c01191c2555351922e5515b6b6d31357975031 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Mon, 22 Apr 2024 14:44:38 -0400 Subject: drm/amdgpu: Add reset_context flag for host FLR There are other reset sources that pass NULL as the job pointer, such as amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR comes from the host does not work. Add a flag in reset_context to explicitly mark host triggered reset, and set this flag when we receive host reset notification. Signed-off-by: Yunxiang Li Reviewed-by: Emily Deng Reviewed-by: Zhigang Luo Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 14a065516ae4..78cd07744ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } -- cgit