From 56b53c0b5aa5de49747351b2ad323fd36089eb52 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 10 Mar 2021 17:20:45 +0800 Subject: drm/amdgpu: add codes to capture invalid hardware access when recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovery thread has begun GPU reset, there should be not other threads to access hardware, otherwise system randomly hang. v2 (chk): rewritten from scratch, use trylock and lockdep instead of hand wiring the logic. v3: add in_irq check v4: change to check in_task Signed-off-by: Dennis Li Signed-off-by: Christian König Reviewed-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0f82c5d21237..2d080622eb23 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -326,6 +326,35 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, /* * register access helper functions. */ + +/* Check if hw access should be skipped because of hotplug or device error */ +bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) +{ + if (adev->in_pci_err_recovery) + return true; + +#ifdef CONFIG_LOCKDEP + /* + * This is a bit complicated to understand, so worth a comment. What we assert + * here is that the GPU reset is not running on another thread in parallel. + * + * For this we trylock the read side of the reset semaphore, if that succeeds + * we know that the reset is not running in paralell. + * + * If the trylock fails we assert that we are either already holding the read + * side of the lock or are the reset thread itself and hold the write side of + * the lock. + */ + if (in_task()) { + if (down_read_trylock(&adev->reset_sem)) + up_read(&adev->reset_sem); + else + lockdep_assert_held(&adev->reset_sem); + } +#endif + return false; +} + /** * amdgpu_device_rreg - read a memory mapped IO or indirect register * @@ -340,7 +369,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, { uint32_t ret; - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if ((reg * 4) < adev->rmmio_size) { @@ -377,7 +406,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, */ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (offset < adev->rmmio_size) @@ -402,7 +431,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (offset < adev->rmmio_size) @@ -425,7 +454,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if ((reg * 4) < adev->rmmio_size) { @@ -452,7 +481,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (amdgpu_sriov_fullaccess(adev) && @@ -476,7 +505,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -499,7 +528,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { @@ -520,7 +549,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -543,7 +572,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { -- cgit From fe68ceef3412544aa8e2e2ff397f76edfedf6bd9 Mon Sep 17 00:00:00 2001 From: Xiaojian Du Date: Thu, 18 Mar 2021 15:39:21 +0800 Subject: Revert "drm/amdgpu: disable gpu reset on Vangogh for now" This reverts commit 33cf440d594bfbf81fc20604957bc64f02d0b560. And it will enable mode-2 gpu reset for vangogh, it asks PSP firmware version is 00.1A.00.0F or newer. Signed-off-by: Xiaojian Du Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2d080622eb23..7db60edfb5d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4254,6 +4254,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: case CHIP_DIMGREY_CAVEFISH: + case CHIP_VANGOGH: break; default: goto disabled; -- cgit From b98c6299ef992660f5ca4392287a11ea2439c664 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 10 Mar 2021 00:43:35 -0500 Subject: drm/amdgpu: disentangle HG systems from vgaswitcheroo There's no need to keep vgaswitcheroo around for HG systems. They don't use muxes and their power control is handled via ACPI. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 +++++++++++++----------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7db60edfb5d1..1c3044f2e767 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -212,18 +212,18 @@ static DEVICE_ATTR(serial_number, S_IRUGO, amdgpu_device_get_serial_number, NULL); /** - * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control + * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control * * @dev: drm_device pointer * - * Returns true if the device is a dGPU with HG/PX power control, + * Returns true if the device is a dGPU with ATPX power control, * otherwise return false. */ -bool amdgpu_device_supports_atpx(struct drm_device *dev) +bool amdgpu_device_supports_px(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); - if (adev->flags & AMD_IS_PX) + if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) return true; return false; } @@ -233,14 +233,15 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev) * * @dev: drm_device pointer * - * Returns true if the device is a dGPU with HG/PX power control, + * Returns true if the device is a dGPU with ACPI power control, * otherwise return false. */ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); - if (adev->has_pr3) + if (adev->has_pr3 || + ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; return false; } @@ -1420,7 +1421,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, struct drm_device *dev = pci_get_drvdata(pdev); int r; - if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF) + if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) return; if (state == VGA_SWITCHEROO_ON) { @@ -3226,7 +3227,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, struct drm_device *ddev = adev_to_drm(adev); struct pci_dev *pdev = adev->pdev; int r, i; - bool atpx = false; + bool px = false; u32 max_MBps; adev->shutdown = false; @@ -3388,16 +3389,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - if (amdgpu_device_supports_atpx(ddev)) - atpx = true; - if (amdgpu_has_atpx() && - (amdgpu_is_atpx_hybrid() || - amdgpu_has_atpx_dgpu_power_cntl()) && - !pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_device_supports_px(ddev)) { + px = true; vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, atpx); - if (atpx) + &amdgpu_switcheroo_ops, px); vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); + } if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ @@ -3604,7 +3601,7 @@ release_ras_con: failed: amdgpu_vf_error_trans_all(adev); - if (atpx) + if (px) vga_switcheroo_fini_domain_pm_ops(adev->dev); failed_unmap: @@ -3664,13 +3661,10 @@ void amdgpu_device_fini(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; - if (amdgpu_has_atpx() && - (amdgpu_is_atpx_hybrid() || - amdgpu_has_atpx_dgpu_power_cntl()) && - !pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_device_supports_px(adev_to_drm(adev))) { vga_switcheroo_unregister_client(adev->pdev); - if (amdgpu_device_supports_atpx(adev_to_drm(adev))) vga_switcheroo_fini_domain_pm_ops(adev->dev); + } if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, NULL, NULL, NULL); iounmap(adev->rmmio); -- cgit From e5192f7b4af684e0ba09f8b9ae30cb164bdedde5 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 9 Mar 2021 09:34:00 +0800 Subject: drm/amdgpu: fix the hibernation suspend with s0ix During system hibernation suspend still need un-gate gfx CG/PG firstly to handle HW status check before HW resource destory. Signed-off-by: Prike Liang Acked-by: Alex Deucher Acked-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1c3044f2e767..87ebf67086e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2685,7 +2685,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (adev->in_poweroff_reboot_com || + if (adev->in_poweroff_reboot_com || adev->in_hibernate || !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); @@ -3766,7 +3766,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_fence_driver_suspend(adev); - if (adev->in_poweroff_reboot_com || + /* + * TODO: Need figure out the each GNB IP idle off dependency and then + * improve the AMDGPU suspend/resume sequence for system-wide Sx entry/exit. + */ + if (adev->in_poweroff_reboot_com || adev->in_hibernate || !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) r = amdgpu_device_ip_suspend_phase2(adev); else -- cgit From 62498733d4c4fde8bc15215c5502923ff8224f86 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Mar 2021 15:22:36 -0500 Subject: drm/amdgpu: rework S3/S4/S0ix state handling Set flags at the top level pmops callbacks to track state. This cleans up the current set of flags and properly handles S4 on S0ix capable systems. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 87ebf67086e0..2359449567be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2685,8 +2685,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (adev->in_poweroff_reboot_com || adev->in_hibernate || - !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { + if (!adev->in_s0ix || amdgpu_in_reset(adev)) { amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); } @@ -3766,12 +3765,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_fence_driver_suspend(adev); - /* - * TODO: Need figure out the each GNB IP idle off dependency and then - * improve the AMDGPU suspend/resume sequence for system-wide Sx entry/exit. - */ - if (adev->in_poweroff_reboot_com || adev->in_hibernate || - !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) + if (!adev->in_s0ix || amdgpu_in_reset(adev)) r = amdgpu_device_ip_suspend_phase2(adev); else amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); @@ -3805,7 +3799,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; - if (amdgpu_acpi_is_s0ix_supported(adev)) + if (adev->in_s0ix) amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); /* post card */ -- cgit From a2e15b0e6c91a13d2219fba1f9e83ecd473a60db Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 19 Mar 2021 16:34:45 -0400 Subject: drm/amdgpu: clean up non-DC suspend/resume handling Move the non-DC specific code into the DCE IP blocks similar to how we handle DC. This cleans up the common suspend and resume pathes. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 86 +----------------------------- 1 file changed, 2 insertions(+), 84 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2359449567be..1a0fe315dee2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3696,14 +3696,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) */ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) { - struct amdgpu_device *adev; - struct drm_crtc *crtc; - struct drm_connector *connector; - struct drm_connector_list_iter iter; + struct amdgpu_device *adev = drm_to_adev(dev); int r; - adev = drm_to_adev(dev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -3715,45 +3710,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) cancel_delayed_work_sync(&adev->delayed_init_work); - if (!amdgpu_device_has_dc_support(adev)) { - /* turn off display hw */ - drm_modeset_lock_all(dev); - drm_connector_list_iter_begin(dev, &iter); - drm_for_each_connector_iter(connector, &iter) - drm_helper_connector_dpms(connector, - DRM_MODE_DPMS_OFF); - drm_connector_list_iter_end(&iter); - drm_modeset_unlock_all(dev); - /* unpin the front buffers and cursors */ - list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { - struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); - struct drm_framebuffer *fb = crtc->primary->fb; - struct amdgpu_bo *robj; - - if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { - struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); - r = amdgpu_bo_reserve(aobj, true); - if (r == 0) { - amdgpu_bo_unpin(aobj); - amdgpu_bo_unreserve(aobj); - } - } - - if (fb == NULL || fb->obj[0] == NULL) { - continue; - } - robj = gem_to_amdgpu_bo(fb->obj[0]); - /* don't unpin kernel fb objects */ - if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { - r = amdgpu_bo_reserve(robj, true); - if (r == 0) { - amdgpu_bo_unpin(robj); - amdgpu_bo_unreserve(robj); - } - } - } - } - amdgpu_ras_suspend(adev); r = amdgpu_device_ip_suspend_phase1(adev); @@ -3790,10 +3746,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) */ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) { - struct drm_connector *connector; - struct drm_connector_list_iter iter; struct amdgpu_device *adev = drm_to_adev(dev); - struct drm_crtc *crtc; int r = 0; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) @@ -3824,24 +3777,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) queue_delayed_work(system_wq, &adev->delayed_init_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); - if (!amdgpu_device_has_dc_support(adev)) { - /* pin cursors */ - list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { - struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); - - if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { - struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); - r = amdgpu_bo_reserve(aobj, true); - if (r == 0) { - r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); - if (r != 0) - dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); - amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); - amdgpu_bo_unreserve(aobj); - } - } - } - } r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) return r; @@ -3849,25 +3784,8 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) /* Make sure IB tests flushed */ flush_delayed_work(&adev->delayed_init_work); - /* blat the mode back in */ - if (fbcon) { - if (!amdgpu_device_has_dc_support(adev)) { - /* pre DCE11 */ - drm_helper_resume_force_mode(dev); - - /* turn on display hw */ - drm_modeset_lock_all(dev); - - drm_connector_list_iter_begin(dev, &iter); - drm_for_each_connector_iter(connector, &iter) - drm_helper_connector_dpms(connector, - DRM_MODE_DPMS_ON); - drm_connector_list_iter_end(&iter); - - drm_modeset_unlock_all(dev); - } + if (fbcon) amdgpu_fbdev_set_suspend(adev, 0); - } drm_kms_helper_poll_enable(dev); -- cgit From 344169315707a1147cf5bda629ac6ee1fbd3a1bd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Mar 2021 15:33:46 -0500 Subject: drm/amdgpu: move s0ix check into amdgpu_device_ip_suspend_phase2 (v3) No functional change. v2: use correct dev v3: rework Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1a0fe315dee2..27279131c123 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2728,6 +2728,11 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) { int i, r; + if (adev->in_s0ix) { + amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); + return 0; + } + for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) continue; @@ -3721,10 +3726,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_fence_driver_suspend(adev); - if (!adev->in_s0ix || amdgpu_in_reset(adev)) - r = amdgpu_device_ip_suspend_phase2(adev); - else - amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); + r = amdgpu_device_ip_suspend_phase2(adev); /* evict remaining vram memory * This second call to evict vram is to evict the gart page table * using the CPU. -- cgit From 557f42a2b38cc763736ba4f88f012c1cf8f259e2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Mar 2021 15:36:04 -0500 Subject: drm/amdgpu: re-enable suspend phase 2 for S0ix This really needs to be done to properly tear down the device. SMC, PSP, and GFX are still problematic, need to dig deeper into what aspect of them that is problematic. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 27279131c123..6a1416cb1378 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2728,10 +2728,8 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) { int i, r; - if (adev->in_s0ix) { + if (adev->in_s0ix) amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); - return 0; - } for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) @@ -2755,6 +2753,14 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; continue; } + + /* XXX fix these remaining cases */ + if (adev->in_s0ix && + (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || /* breaks suspend */ + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || /* breaks resume */ + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) /* breaks suspend */ + continue; + /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ -- cgit From f937008757a2048e1b22bb067e5fe36b1f4fb1af Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Mar 2021 16:00:21 -0500 Subject: drm/amdgpu/swsmu: skip gfx cgpg on s0ix suspend The SMU expects CGPG to be enabled when entering S0ix. with this we can re-enable SMU suspend. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6a1416cb1378..5ec9b8793ae5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2756,8 +2756,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* XXX fix these remaining cases */ if (adev->in_s0ix && - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || /* breaks suspend */ - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || /* breaks resume */ + (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || /* breaks resume */ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) /* breaks suspend */ continue; -- cgit From 32ff160da7ffc707e3bccfe1fdd3711ac2246164 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 16 Mar 2021 14:00:36 -0400 Subject: drm/amdgpu: update comments about s0ix suspend/resume Provide and explanation as to why we skip GFX and PSP for S0ix. GFX goes into gfxoff, same as runtime, so no need to tear down and re-init. PSP is part of the always on state, so no need to touch it. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5ec9b8793ae5..579633134f7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2754,10 +2754,14 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) continue; } - /* XXX fix these remaining cases */ + /* skip suspend of gfx and psp for S0ix + * gfx is in gfxoff state, so on resume it will exit gfxoff just + * like at runtime. PSP is also part of the always on hardware + * so no need to suspend it. + */ if (adev->in_s0ix && - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || /* breaks resume */ - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) /* breaks suspend */ + (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) continue; /* XXX handle errors */ -- cgit From 5d70a549d00dc1b8f8ae227ff192ca38f902b57c Mon Sep 17 00:00:00 2001 From: Pratik Vishwakarma Date: Tue, 16 Mar 2021 14:15:44 -0400 Subject: drm/amdgpu: skip CG/PG for gfx during S0ix Not needed as the device is in gfxoff state so the CG/PG state is handled just like it would be for gfxoff during runtime gfxoff. This should also prevent delays on resume. Reworked from Pratik's original patch (Alex) Acked-by: Evan Quan Signed-off-by: Alex Deucher Signed-off-by: Pratik Vishwakarma --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 579633134f7a..b9e5f5047467 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2373,6 +2373,10 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; if (!adev->ip_blocks[i].status.late_initialized) continue; + /* skip CG for GFX on S0ix */ + if (adev->in_s0ix && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) + continue; /* skip CG for VCE/UVD, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && @@ -2404,6 +2408,10 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; if (!adev->ip_blocks[i].status.late_initialized) continue; + /* skip PG for GFX on S0ix */ + if (adev->in_s0ix && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) + continue; /* skip CG for VCE/UVD, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && -- cgit From 50ec83f0d820bd7c7ef0c88a91816a7e2bb2682c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 16 Mar 2021 14:18:30 -0400 Subject: drm/amdgpu: drop S0ix checks around CG/PG in suspend We handle it properly within the CG/PG functions directly now. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b9e5f5047467..5705bde4547b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2693,10 +2693,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (!adev->in_s0ix || amdgpu_in_reset(adev)) { - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); - amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - } + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) -- cgit From 5d3a2d95224da3213b1ce60fe28bf60b3dfe6827 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 16 Mar 2021 22:02:14 -0400 Subject: drm/amdgpu: skip kfd suspend/resume for S0ix GFX is in gfxoff mode during s0ix so we shouldn't need to actually tear anything down and restore it. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5705bde4547b..7ae5e1d8202d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3734,7 +3734,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) r = amdgpu_device_ip_suspend_phase1(adev); - amdgpu_amdkfd_suspend(adev, adev->in_runpm); + if (!adev->in_s0ix) + amdgpu_amdkfd_suspend(adev, adev->in_runpm); /* evict vram memory */ amdgpu_bo_evict_vram(adev); @@ -3794,9 +3795,11 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) queue_delayed_work(system_wq, &adev->delayed_init_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); - r = amdgpu_amdkfd_resume(adev, adev->in_runpm); - if (r) - return r; + if (!adev->in_s0ix) { + r = amdgpu_amdkfd_resume(adev, adev->in_runpm); + if (r) + return r; + } /* Make sure IB tests flushed */ flush_delayed_work(&adev->delayed_init_work); -- cgit From 437f3e0b6eb24cc777473ae55f4b98e720258779 Mon Sep 17 00:00:00 2001 From: Horace Chen Date: Tue, 23 Mar 2021 14:22:22 +0800 Subject: drm/amdgpu: move vram recover into sriov full access [what] currently driver recover vram after full access, which may hit a corner case that meanwhile another whole gpu reset may be triggered by another VF, which will cause vram recover fail then fail the whole device reset. [how] move the recover vram into full access. So another bad VF will not disturb the recover sequence for this vf. Signed-off-by: Horace Chen Reviewed by: Monk.Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7ae5e1d8202d..789b02508077 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4103,11 +4103,11 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_amdkfd_post_reset(adev); error: - amdgpu_virt_release_full_gpu(adev, true); if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { amdgpu_inc_vram_lost(adev); r = amdgpu_device_recover_vram(adev); } + amdgpu_virt_release_full_gpu(adev, true); return r; } -- cgit From 36000c7a51080840902d79e1558851076ecb7a96 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 24 Mar 2021 17:17:40 +0800 Subject: drm/amdgpu: Convert sysfs sprintf/snprintf family to sysfs_emit Fix the following coccicheck warning: drivers/gpu//drm/amd/amdgpu/amdgpu_ras.c:434:9-17: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_xgmi.c:220:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_xgmi.c:249:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/df_v3_6.c:208:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_psp.c:2973:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:75:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:112:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:58:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:93:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:125:9-17: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_gtt_mgr.c:52:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_gtt_mgr.c:71:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:140:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:164:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:186:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:208:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_atombios.c:1916:8-16: WARNING: use scnprintf or sprintf Signed-off-by: Tian Tao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 789b02508077..165192b2c336 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -137,7 +137,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, struct amdgpu_device *adev = drm_to_adev(ddev); uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); - return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); + return sysfs_emit(buf, "%llu\n", cnt); } static DEVICE_ATTR(pcie_replay_count, S_IRUGO, @@ -161,7 +161,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); + return sysfs_emit(buf, "%s\n", adev->product_name); } static DEVICE_ATTR(product_name, S_IRUGO, @@ -183,7 +183,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); + return sysfs_emit(buf, "%s\n", adev->product_number); } static DEVICE_ATTR(product_number, S_IRUGO, @@ -205,7 +205,7 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); + return sysfs_emit(buf, "%s\n", adev->serial); } static DEVICE_ATTR(serial_number, S_IRUGO, -- cgit From e6c6338f393b74ac0b303d567bb918b44ae7ad75 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 8 Mar 2021 12:41:27 +0800 Subject: drm/amd/amdgpu implement tdr advanced mode [Why] Previous tdr design treats the first job in job_timeout as the bad job. But sometimes a later bad compute job can block a good gfx job and cause an unexpected gfx job timeout because gfx and compute ring share internal GC HW mutually. [How] This patch implements an advanced tdr mode.It involves an additinal synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit step in order to find the real bad job. 1. At Step0 Resubmit stage, it synchronously submits and pends for the first job being signaled. If it gets timeout, we identify it as guilty and do hw reset. After that, we would do the normal resubmit step to resubmit left jobs. 2. For whole gpu reset(vram lost), do resubmit as the old way. v2: squash in build fix (Alex) Signed-off-by: Jack Zhang Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 81 ++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 165192b2c336..3cbe8137a6af 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4575,6 +4575,73 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) return 0; } +void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive, + struct list_head *device_list_handle, + bool *need_full_reset) +{ + int i, r = 0; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + int ret = 0; + struct drm_sched_job *s_job; + + if (!ring || !ring->sched.thread) + continue; + + s_job = list_first_entry_or_null(&ring->sched.pending_list, + struct drm_sched_job, list); + if (s_job == NULL) + continue; + + /* clear job's guilty and depend the folowing step to decide the real one */ + drm_sched_reset_karma(s_job); + drm_sched_resubmit_jobs_ext(&ring->sched, 1); + + ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); + if (ret == 0) { /* timeout */ + DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", + ring->sched.name, s_job->id); + + /* set guilty */ + drm_sched_increase_karma(s_job); +retry: + /* do hw reset */ + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_fini_data_exchange(adev); + r = amdgpu_device_reset_sriov(adev, false); + if (r) + adev->asic_reset_res = r; + } else { + r = amdgpu_do_asic_reset(hive, device_list_handle, + need_full_reset, false); + if (r && r == -EAGAIN) + goto retry; + } + + /* + * add reset counter so that the following + * resubmitted job could flush vmid + */ + atomic_inc(&adev->gpu_reset_counter); + continue; + } + + /* got the hw fence, signal finished fence */ + atomic_dec(ring->sched.score); + dma_fence_get(&s_job->s_fence->finished); + dma_fence_signal(&s_job->s_fence->finished); + dma_fence_put(&s_job->s_fence->finished); + + /* remove node from list and free the job */ + spin_lock(&ring->sched.job_list_lock); + list_del_init(&s_job->list); + spin_unlock(&ring->sched.job_list_lock); + ring->sched.ops->free_job(s_job); + } +} + /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -4597,6 +4664,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool need_emergency_restart = false; bool audio_suspended = false; + int tmp_vram_lost_counter; /* * Special case: RAS triggered and full reset isn't supported @@ -4748,6 +4816,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ } } + tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); /* Actual ASIC resets if needed.*/ /* TODO Implement XGMI hive reset logic for SRIOV */ if (amdgpu_sriov_vf(adev)) { @@ -4765,6 +4834,18 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + /* + * Sometimes a later bad compute job can block a good gfx job as gfx + * and compute ring share internal GC HW mutually. We add an additional + * guilty jobs recheck step to find the real guilty job, it synchronously + * submits and pends for the first job being signaled. If it gets timeout, + * we identify it as a real guilty job. + */ + if (amdgpu_gpu_recovery == 2 && + !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) + amdgpu_device_recheck_guilty_jobs(tmp_adev, hive, + device_list_handle, &need_full_reset); + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; -- cgit From 04442bf70debb197d4ed4e850aa77213e685b352 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 16 Mar 2021 20:31:51 +0800 Subject: drm/amdgpu: Add reset control handling to reset workflow This prefers reset control based handling if it's implemented for a particular ASIC. If not, it takes the legacy path. It uses the legacy method of preparing environment (job, scheduler tasks) and restoring environment. v2: remove unused variable (Alex) Signed-off-by: Lijo Lazar Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 108 +++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 30 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3cbe8137a6af..a51f470631d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,6 +65,7 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" +#include "amdgpu_reset.h" #include #include @@ -3421,6 +3422,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, goto fence_driver_init; } + amdgpu_reset_init(adev); + /* detect if we are with an SRIOV vbios */ amdgpu_device_detect_sriov_bios(adev); @@ -3671,6 +3674,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; + + amdgpu_reset_fini(adev); + /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) amdgpu_i2c_fini(adev); @@ -4239,11 +4245,15 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) } int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, - struct amdgpu_job *job, - bool *need_full_reset_arg) + struct amdgpu_reset_context *reset_context) { int i, r = 0; - bool need_full_reset = *need_full_reset_arg; + struct amdgpu_job *job = NULL; + bool need_full_reset = + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + + if (reset_context->reset_req_dev == adev) + job = reset_context->job; /* no need to dump if device is not in good state during probe period */ if (!adev->gmc.xgmi.pending_reset) @@ -4268,6 +4278,10 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if(job) drm_sched_increase_karma(&job->base); + r = amdgpu_reset_prepare_hwcontext(adev, reset_context); + if (r != -ENOSYS) + return r; + /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ if (!amdgpu_sriov_vf(adev)) { @@ -4286,22 +4300,36 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (need_full_reset) r = amdgpu_device_ip_suspend(adev); - - *need_full_reset_arg = need_full_reset; + if (need_full_reset) + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + else + clear_bit(AMDGPU_NEED_FULL_RESET, + &reset_context->flags); } return r; } -int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, - struct list_head *device_list_handle, - bool *need_full_reset_arg, - bool skip_hw_reset) +int amdgpu_do_asic_reset(struct list_head *device_list_handle, + struct amdgpu_reset_context *reset_context) { struct amdgpu_device *tmp_adev = NULL; - bool need_full_reset = *need_full_reset_arg, vram_lost = false; + bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + /* Try reset handler method first */ + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + r = amdgpu_reset_perform_reset(tmp_adev, reset_context); + + if (r != -ENOSYS) + return r; + + /* Reset handler not implemented, use the default method */ + need_full_reset = + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4385,7 +4413,8 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ amdgpu_register_gpu_instance(tmp_adev); - if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) + if (!reset_context->hive && + tmp_adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(tmp_adev); r = amdgpu_device_ip_late_init(tmp_adev); @@ -4413,8 +4442,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } /* Update PSP FW topology after reset */ - if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) - r = amdgpu_xgmi_update_topology(hive, tmp_adev); + if (reset_context->hive && + tmp_adev->gmc.xgmi.num_physical_nodes > 1) + r = amdgpu_xgmi_update_topology( + reset_context->hive, tmp_adev); } } @@ -4438,7 +4469,10 @@ out: } end: - *need_full_reset_arg = need_full_reset; + if (need_full_reset) + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + else + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); return r; } @@ -4575,10 +4609,9 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) return 0; } -void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, - struct list_head *device_list_handle, - bool *need_full_reset) +void amdgpu_device_recheck_guilty_jobs( + struct amdgpu_device *adev, struct list_head *device_list_handle, + struct amdgpu_reset_context *reset_context) { int i, r = 0; @@ -4614,8 +4647,10 @@ retry: if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, - need_full_reset, false); + clear_bit(AMDGPU_SKIP_HW_RESET, + &reset_context->flags); + r = amdgpu_do_asic_reset(device_list_handle, + reset_context); if (r && r == -EAGAIN) goto retry; } @@ -4657,7 +4692,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; - bool need_full_reset = false; bool job_signaled = false; struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; @@ -4665,6 +4699,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); /* * Special case: RAS triggered and full reset isn't supported @@ -4705,6 +4742,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, mutex_lock(&hive->hive_lock); } + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + reset_context.job = job; + reset_context.hive = hive; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /* * lock the device before we try to operate the linked list * if didn't get the device lock, don't touch the linked list since @@ -4805,9 +4848,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - r = amdgpu_device_pre_asic_reset(tmp_adev, - (tmp_adev == adev) ? job : NULL, - &need_full_reset); + r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -4824,7 +4865,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); + r = amdgpu_do_asic_reset(device_list_handle, &reset_context); if (r && r == -EAGAIN) goto retry; } @@ -4843,8 +4884,8 @@ skip_hw_reset: */ if (amdgpu_gpu_recovery == 2 && !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) - amdgpu_device_recheck_guilty_jobs(tmp_adev, hive, - device_list_handle, &need_full_reset); + amdgpu_device_recheck_guilty_jobs( + tmp_adev, device_list_handle, &reset_context); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -5189,12 +5230,14 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); int r, i; - bool need_full_reset = true; + struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; DRM_INFO("PCI error: slot reset callback!!\n"); + memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); list_add_tail(&adev->reset_list, &device_list); @@ -5217,13 +5260,18 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) goto out; } + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + adev->in_pci_err_recovery = true; - r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + r = amdgpu_device_pre_asic_reset(adev, &reset_context); adev->in_pci_err_recovery = false; if (r) goto out; - r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + r = amdgpu_do_asic_reset(&device_list, &reset_context); out: if (!r) { -- cgit From 5d89bb2d2f53b27460f77c89756597dee507cd34 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 16 Mar 2021 21:14:40 +0800 Subject: drm/amdgpu: Make set PG/CG state functions public Expose PG/CG set states functions for other clients Signed-off-by: Lijo Lazar Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a51f470631d4..2c66fb144d55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2362,8 +2362,8 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * Returns 0 on success, negative error code on failure. */ -static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, - enum amd_clockgating_state state) +int amdgpu_device_set_cg_state(struct amdgpu_device *adev, + enum amd_clockgating_state state) { int i, j, r; @@ -2398,7 +2398,8 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, return 0; } -static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) +int amdgpu_device_set_pg_state(struct amdgpu_device *adev, + enum amd_powergating_state state) { int i, j, r; -- cgit From ea4e96a7b3e7a8e23e81d0c8faca0fbb3f4679bb Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 23 Mar 2021 18:50:50 +0800 Subject: drm/amdgpu: Enable recovery on aldebaran Add aldebaran to devices which support recovery Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2c66fb144d55..319d69646a13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4192,6 +4192,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVY_FLOUNDER: case CHIP_DIMGREY_CAVEFISH: case CHIP_VANGOGH: + case CHIP_ALDEBARAN: break; default: goto disabled; -- cgit From 404b277bbe4945830e5ebc01a93ff9fe8403702f Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 26 Mar 2021 17:47:20 +0800 Subject: drm/amdgpu: Reset error code for 'no handler' case If reset handler is not implemented, reset error before proceeding. Fixes issue with the following trace - [ 106.508592] amdgpu 0000:b1:00.0: amdgpu: ASIC reset failed with error, -38 for drm dev, 0000:b1:00.0 [ 106.508972] amdgpu 0000:b1:00.0: amdgpu: GPU reset succeeded, trying to resume [ 106.509116] [drm] PCIE GART of 512M enabled. [ 106.509120] [drm] PTB located at 0x0000008000000000 [ 106.509136] [drm] VRAM is lost due to GPU reset! [ 106.509332] [drm] PSP is resuming... Signed-off-by: Lijo Lazar Reviewed-and-tested-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 319d69646a13..a501d1a4d000 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4281,7 +4281,10 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, drm_sched_increase_karma(&job->base); r = amdgpu_reset_prepare_hwcontext(adev, reset_context); - if (r != -ENOSYS) + /* If reset handler not implemented, continue; otherwise return */ + if (r == -ENOSYS) + r = 0; + else return r; /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ @@ -4323,8 +4326,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); r = amdgpu_reset_perform_reset(tmp_adev, reset_context); - - if (r != -ENOSYS) + /* If reset handler not implemented, continue; otherwise return */ + if (r == -ENOSYS) + r = 0; + else return r; /* Reset handler not implemented, use the default method */ -- cgit From 77eabc6f5975dafeb76f7c7c2451282b91e9f5b6 Mon Sep 17 00:00:00 2001 From: Peng Ju Zhou Date: Mon, 29 Mar 2021 15:47:20 +0800 Subject: drm/amdgpu: indirect register access for nv12 sriov get pf2vf msg info at it's earliest time so that guest driver can use these info to decide whether register indirect access enabled. Signed-off-by: Peng Ju Zhou Reviewed-by: Emily.Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a501d1a4d000..060d0ae99453 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2080,6 +2080,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); return r; } + + /*get pf2vf msg info at it's earliest time*/ + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_init_data_exchange(adev); + } } -- cgit From 5e025531b773ee9789a9a9948fc7e74e6077ddd5 Mon Sep 17 00:00:00 2001 From: Peng Ju Zhou Date: Mon, 22 Mar 2021 15:18:01 +0800 Subject: drm/amdgpu: indirect register access for nv12 sriov 1. expand rlcg interface for gc & mmhub indirect access 2. add rlcg interface for no kiq v2: squash in fix for gfx9 (Changfeng) Signed-off-by: Peng Ju Zhou Reviewed-by: Emily.Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 060d0ae99453..438e2f732377 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -490,7 +490,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) - return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); + return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } -- cgit From 8bc7b360ad4b0a090380d7548dbf24a627f0b035 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 19 Mar 2021 15:50:14 +0800 Subject: drm/amdgpu: split mmhub callbacks into ras and non-ras ones mmhub ras is only avaiable in cerntain mmhub ip generation. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 438e2f732377..b4ad1c055c70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3142,8 +3142,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) - adev->mmhub.funcs->reset_ras_error_count(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_count) + adev->mmhub.ras_funcs->reset_ras_error_count(adev); } else { task_barrier_full(&hive->tb); @@ -4378,9 +4379,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.funcs && - tmp_adev->mmhub.funcs->reset_ras_error_count) - tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); + if (tmp_adev->mmhub.ras_funcs && + tmp_adev->mmhub.ras_funcs->reset_ras_error_count) + tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); } amdgpu_ras_intr_cleared(); -- cgit From 8c3dd61cfa05a65a7e1a8a028000fc95856156c4 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 26 Apr 2021 18:50:00 +0800 Subject: drm/amdgpu: Register VGA clients after init can no longer fail When an amdgpu device fails to init, it makes another VGA device cause kernel splat: kernel: amdgpu 0000:08:00.0: amdgpu: amdgpu_device_ip_init failed kernel: amdgpu 0000:08:00.0: amdgpu: Fatal error during GPU init kernel: amdgpu: probe of 0000:08:00.0 failed with error -110 ... kernel: amdgpu 0000:01:00.0: vgaarb: changed VGA decodes: olddecodes=io+mem,decodes=none:owns=none kernel: BUG: kernel NULL pointer dereference, address: 0000000000000018 kernel: #PF: supervisor read access in kernel mode kernel: #PF: error_code(0x0000) - not-present page kernel: PGD 0 P4D 0 kernel: Oops: 0000 [#1] SMP NOPTI kernel: CPU: 6 PID: 1080 Comm: Xorg Tainted: G W 5.12.0-rc8+ #12 kernel: Hardware name: HP HP EliteDesk 805 G6/872B, BIOS S09 Ver. 02.02.00 12/30/2020 kernel: RIP: 0010:amdgpu_device_vga_set_decode+0x13/0x30 [amdgpu] kernel: Code: 06 31 c0 c3 b8 ea ff ff ff 5d c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 0f 1f 44 00 00 55 48 8b 87 90 06 00 00 48 89 e5 53 89 f3 <48> 8b 40 18 40 0f b6 f6 e8 40 58 39 fd 80 fb 01 5b 5d 19 c0 83 e0 kernel: RSP: 0018:ffffae3c0246bd68 EFLAGS: 00010002 kernel: RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 kernel: RDX: ffff8dd1af5a8560 RSI: 0000000000000000 RDI: ffff8dce8c160000 kernel: RBP: ffffae3c0246bd70 R08: ffff8dd1af5985c0 R09: ffffae3c0246ba38 kernel: R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000246 kernel: R13: 0000000000000000 R14: 0000000000000003 R15: ffff8dce81490000 kernel: FS: 00007f9303d8fa40(0000) GS:ffff8dd1af580000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000018 CR3: 0000000103cfa000 CR4: 0000000000350ee0 kernel: Call Trace: kernel: vga_arbiter_notify_clients.part.0+0x4a/0x80 kernel: vga_get+0x17f/0x1c0 kernel: vga_arb_write+0x121/0x6a0 kernel: ? apparmor_file_permission+0x1c/0x20 kernel: ? security_file_permission+0x30/0x180 kernel: vfs_write+0xca/0x280 kernel: ksys_write+0x67/0xe0 kernel: __x64_sys_write+0x1a/0x20 kernel: do_syscall_64+0x38/0x90 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae kernel: RIP: 0033:0x7f93041e02f7 kernel: Code: 75 05 48 83 c4 58 c3 e8 f7 33 ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 kernel: RSP: 002b:00007fff60e49b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 kernel: RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f93041e02f7 kernel: RDX: 000000000000000b RSI: 00007fff60e49b40 RDI: 000000000000000f kernel: RBP: 00007fff60e49b40 R08: 00000000ffffffff R09: 00007fff60e499d0 kernel: R10: 00007f93049350b5 R11: 0000000000000246 R12: 000056111d45e808 kernel: R13: 0000000000000000 R14: 000056111d45e7f8 R15: 000056111d46c980 kernel: Modules linked in: nls_iso8859_1 snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_seq input_leds snd_seq_device snd_timer snd soundcore joydev kvm_amd serio_raw k10temp mac_hid hp_wmi ccp kvm sparse_keymap wmi_bmof ucsi_acpi efi_pstore typec_ucsi rapl typec video wmi sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c xor raid6_pq raid1 raid0 multipath linear dm_mirror dm_region_hash dm_log hid_generic usbhid hid amdgpu drm_ttm_helper ttm iommu_v2 gpu_sched i2c_algo_bit drm_kms_helper syscopyarea sysfillrect crct10dif_pclmul sysimgblt crc32_pclmul fb_sys_fops ghash_clmulni_intel cec rc_core aesni_intel crypto_simd psmouse cryptd r8169 i2c_piix4 drm ahci xhci_pci realtek libahci xhci_pci_renesas gpio_amdpt gpio_generic kernel: CR2: 0000000000000018 kernel: ---[ end trace 76d04313d4214c51 ]--- Commit 4192f7b57689 ("drm/amdgpu: unmap register bar on device init failure") makes amdgpu_driver_unload_kms() skips amdgpu_device_fini(), so the VGA clients remain registered. So when vga_arbiter_notify_clients() iterates over registered clients, it causes NULL pointer dereference. Since there's no reason to register VGA clients that early, so solve the issue by putting them after all the goto cleanups. v2: - Remove redundant vga_switcheroo cleanup in failed: label. Fixes: 4192f7b57689 ("drm/amdgpu: unmap register bar on device init failure") Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b4ad1c055c70..7d3b54615147 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3410,19 +3410,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); - /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ - /* this will fail for cards that aren't VGA class devices, just - * ignore it */ - if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) - vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - - if (amdgpu_device_supports_px(ddev)) { - px = true; - vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, px); - vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - } - if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ emu_soc_asic_init(adev); @@ -3619,6 +3606,19 @@ fence_driver_init: if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(pdev); + /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ + /* this will fail for cards that aren't VGA class devices, just + * ignore it */ + if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); + + if (amdgpu_device_supports_px(ddev)) { + px = true; + vga_switcheroo_register_client(adev->pdev, + &amdgpu_switcheroo_ops, px); + vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); + } + if (adev->gmc.xgmi.pending_reset) queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); @@ -3630,8 +3630,6 @@ release_ras_con: failed: amdgpu_vf_error_trans_all(adev); - if (px) - vga_switcheroo_fini_domain_pm_ops(adev->dev); failed_unmap: iounmap(adev->rmmio); -- cgit