From ed098aa34c51d81a16de93f59c64e36a136fd201 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 18 Mar 2021 16:44:10 -0400 Subject: drm/amdgpu: Add additional Sienna Cichlid PCI ID Add new DID. Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 33991b4a5627..fcda7189d4ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1161,6 +1161,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x73A3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, {0x1002, 0x73AB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, {0x1002, 0x73AE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73AF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, {0x1002, 0x73BF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, /* Van Gogh */ -- cgit From e25443d2765f40a9b3d0056dc4d560a007dd850c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 10 Mar 2021 00:41:49 -0500 Subject: drm/amdgpu: add a dev_pm_ops prepare callback (v2) as per: https://www.kernel.org/doc/html/latest/driver-api/pm/devices.html The prepare callback is required to support the DPM_FLAG_SMART_SUSPEND driver flag. This allows runtime pm to auto complete when the system goes into suspend avoiding a wake up on suspend and on resume. Apply this for hybrid gfx and BOCO systems where d3cold is provided by the ACPI platform. v2: check if device is runtime suspended in prepare. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index fcda7189d4ab..1508da68cfbb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_irq.h" @@ -1403,6 +1404,27 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) return; } +static int amdgpu_pmops_prepare(struct device *dev) +{ + struct drm_device *drm_dev = dev_get_drvdata(dev); + + /* Return a positive number here so + * DPM_FLAG_SMART_SUSPEND works properly + */ + if ((amdgpu_device_supports_atpx(drm_dev) && + amdgpu_is_atpx_hybrid()) || + amdgpu_device_supports_boco(drm_dev)) + return pm_runtime_suspended(dev) && + pm_suspend_via_firmware(); + + return 0; +} + +static void amdgpu_pmops_complete(struct device *dev) +{ + /* nothing to do */ +} + static int amdgpu_pmops_suspend(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); @@ -1621,6 +1643,8 @@ out: } static const struct dev_pm_ops amdgpu_pm_ops = { + .prepare = amdgpu_pmops_prepare, + .complete = amdgpu_pmops_complete, .suspend = amdgpu_pmops_suspend, .resume = amdgpu_pmops_resume, .freeze = amdgpu_pmops_freeze, -- cgit From b98c6299ef992660f5ca4392287a11ea2439c664 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 10 Mar 2021 00:43:35 -0500 Subject: drm/amdgpu: disentangle HG systems from vgaswitcheroo There's no need to keep vgaswitcheroo around for HG systems. They don't use muxes and their power control is handled via ACPI. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 34 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1508da68cfbb..daaacf9067b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1411,9 +1411,7 @@ static int amdgpu_pmops_prepare(struct device *dev) /* Return a positive number here so * DPM_FLAG_SMART_SUSPEND works properly */ - if ((amdgpu_device_supports_atpx(drm_dev) && - amdgpu_is_atpx_hybrid()) || - amdgpu_device_supports_boco(drm_dev)) + if (amdgpu_device_supports_boco(drm_dev)) return pm_runtime_suspended(dev) && pm_suspend_via_firmware(); @@ -1502,7 +1500,7 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) } adev->in_runpm = true; - if (amdgpu_device_supports_atpx(drm_dev)) + if (amdgpu_device_supports_px(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; ret = amdgpu_device_suspend(drm_dev, false); @@ -1511,16 +1509,14 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) return ret; } - if (amdgpu_device_supports_atpx(drm_dev)) { + if (amdgpu_device_supports_px(drm_dev)) { /* Only need to handle PCI state in the driver for ATPX * PCI core handles it for _PR3. */ - if (!amdgpu_is_atpx_hybrid()) { - amdgpu_device_cache_pci_state(pdev); - pci_disable_device(pdev); - pci_ignore_hotplug(pdev); - pci_set_power_state(pdev, PCI_D3cold); - } + amdgpu_device_cache_pci_state(pdev); + pci_disable_device(pdev); + pci_ignore_hotplug(pdev); + pci_set_power_state(pdev, PCI_D3cold); drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF; } else if (amdgpu_device_supports_baco(drm_dev)) { amdgpu_device_baco_enter(drm_dev); @@ -1539,19 +1535,17 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) if (!adev->runpm) return -EINVAL; - if (amdgpu_device_supports_atpx(drm_dev)) { + if (amdgpu_device_supports_px(drm_dev)) { drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; /* Only need to handle PCI state in the driver for ATPX * PCI core handles it for _PR3. */ - if (!amdgpu_is_atpx_hybrid()) { - pci_set_power_state(pdev, PCI_D0); - amdgpu_device_load_pci_state(pdev); - ret = pci_enable_device(pdev); - if (ret) - return ret; - } + pci_set_power_state(pdev, PCI_D0); + amdgpu_device_load_pci_state(pdev); + ret = pci_enable_device(pdev); + if (ret) + return ret; pci_set_master(pdev); } else if (amdgpu_device_supports_boco(drm_dev)) { /* Only need to handle PCI state in the driver for ATPX @@ -1562,7 +1556,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) amdgpu_device_baco_exit(drm_dev); } ret = amdgpu_device_resume(drm_dev, false); - if (amdgpu_device_supports_atpx(drm_dev)) + if (amdgpu_device_supports_px(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_ON; adev->in_runpm = false; return 0; -- cgit From 62498733d4c4fde8bc15215c5502923ff8224f86 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Mar 2021 15:22:36 -0500 Subject: drm/amdgpu: rework S3/S4/S0ix state handling Set flags at the top level pmops callbacks to track state. This cleans up the current set of flags and properly handles S4 on S0ix capable systems. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index daaacf9067b8..02b75fa64d7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1335,9 +1335,7 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) */ if (!amdgpu_passthrough(adev)) adev->mp1_state = PP_MP1_STATE_UNLOAD; - adev->in_poweroff_reboot_com = true; amdgpu_device_ip_suspend(adev); - adev->in_poweroff_reboot_com = false; adev->mp1_state = PP_MP1_STATE_NONE; } @@ -1426,15 +1424,28 @@ static void amdgpu_pmops_complete(struct device *dev) static int amdgpu_pmops_suspend(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; - return amdgpu_device_suspend(drm_dev, true); + if (amdgpu_acpi_is_s0ix_supported(adev)) + adev->in_s0ix = true; + adev->in_s3 = true; + r = amdgpu_device_suspend(drm_dev, true); + adev->in_s3 = false; + + return r; } static int amdgpu_pmops_resume(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; - return amdgpu_device_resume(drm_dev, true); + r = amdgpu_device_resume(drm_dev, true); + if (amdgpu_acpi_is_s0ix_supported(adev)) + adev->in_s0ix = false; + return r; } static int amdgpu_pmops_freeze(struct device *dev) @@ -1443,9 +1454,9 @@ static int amdgpu_pmops_freeze(struct device *dev) struct amdgpu_device *adev = drm_to_adev(drm_dev); int r; - adev->in_hibernate = true; + adev->in_s4 = true; r = amdgpu_device_suspend(drm_dev, true); - adev->in_hibernate = false; + adev->in_s4 = false; if (r) return r; return amdgpu_asic_reset(adev); @@ -1461,13 +1472,8 @@ static int amdgpu_pmops_thaw(struct device *dev) static int amdgpu_pmops_poweroff(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - int r; - adev->in_poweroff_reboot_com = true; - r = amdgpu_device_suspend(drm_dev, true); - adev->in_poweroff_reboot_com = false; - return r; + return amdgpu_device_suspend(drm_dev, true); } static int amdgpu_pmops_restore(struct device *dev) -- cgit From e6c6338f393b74ac0b303d567bb918b44ae7ad75 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 8 Mar 2021 12:41:27 +0800 Subject: drm/amd/amdgpu implement tdr advanced mode [Why] Previous tdr design treats the first job in job_timeout as the bad job. But sometimes a later bad compute job can block a good gfx job and cause an unexpected gfx job timeout because gfx and compute ring share internal GC HW mutually. [How] This patch implements an advanced tdr mode.It involves an additinal synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit step in order to find the real bad job. 1. At Step0 Resubmit stage, it synchronously submits and pends for the first job being signaled. If it gets timeout, we identify it as guilty and do hw reset. After that, we would do the normal resubmit step to resubmit left jobs. 2. For whole gpu reset(vram lost), do resubmit as the old way. v2: squash in build fix (Alex) Signed-off-by: Jack Zhang Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 02b75fa64d7d..0e10c3958f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -516,7 +516,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444); * DOC: gpu_recovery (int) * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV). */ -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)"); +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)"); module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444); /** -- cgit From 04442bf70debb197d4ed4e850aa77213e685b352 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 16 Mar 2021 20:31:51 +0800 Subject: drm/amdgpu: Add reset control handling to reset workflow This prefers reset control based handling if it's implemented for a particular ASIC. If not, it takes the legacy path. It uses the legacy method of preparing environment (job, scheduler tasks) and restoring environment. v2: remove unused variable (Alex) Signed-off-by: Lijo Lazar Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0e10c3958f94..d8f131ed10cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -47,6 +47,7 @@ #include "amdgpu_ras.h" #include "amdgpu_xgmi.h" +#include "amdgpu_reset.h" /* * KMS wrapper. @@ -1349,7 +1350,9 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) struct list_head device_list; struct amdgpu_device *adev; int i, r; - bool need_full_reset = true; + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); mutex_lock(&mgpu_info.mutex); if (mgpu_info.pending_reset == true) { @@ -1359,9 +1362,14 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) mgpu_info.pending_reset = true; mutex_unlock(&mgpu_info.mutex); + /* Use a common context, just need to make sure full reset is done */ + reset_context.method = AMD_RESET_METHOD_NONE; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + for (i = 0; i < mgpu_info.num_dgpu; i++) { adev = mgpu_info.gpu_ins[i].adev; - r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + reset_context.reset_req_dev = adev; + r = amdgpu_device_pre_asic_reset(adev, &reset_context); if (r) { dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", r, adev_to_drm(adev)->unique); @@ -1388,7 +1396,10 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) list_for_each_entry(adev, &device_list, reset_list) amdgpu_unregister_gpu_instance(adev); - r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + /* Use a common context, just need to make sure full reset is done */ + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + r = amdgpu_do_asic_reset(&device_list, &reset_context); + if (r) { DRM_ERROR("reinit gpus failure"); return; -- cgit From 28a5d7a58949aede16f8572ba501aa2ee2a60657 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 16 Apr 2021 14:44:27 +0800 Subject: drm/amdgpu: correct default gfx wdt timeout setting When gfx wdt was configured to fatal_disable, the timeout period should be configured to 0x0 (timeout disabled) Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d8f131ed10cb..922938931e1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -185,7 +185,7 @@ uint amdgpu_ras_mask = 0xffffffff; int amdgpu_bad_page_threshold = -1; struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { .timeout_fatal_disable = false, - .period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */ + .period = 0x0, /* default to 0x0 (timeout disable) */ }; /** @@ -553,7 +553,7 @@ module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_di * DOC: timeout_period (uint) * Modify the watchdog timeout max_cycles as (1 << period) */ -MODULE_PARM_DESC(timeout_period, "watchdog timeout period (1 to 0x23(default), timeout maxCycles = (1 << period)"); +MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0 = timeout disabled, 1 ~ 0x23 = timeout maxcycles = (1 << period)"); module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644); /** -- cgit From b45aeb2dea9142d4d32fa3a117ba381d84f27065 Mon Sep 17 00:00:00 2001 From: Pavan Kumar Ramayanam Date: Tue, 27 Apr 2021 10:21:18 +0530 Subject: drm/amdgpu: Handling of amdgpu_device_resume return value for graceful teardown The runtime resume PM op disregards the return value from amdgpu_device_resume(), masking errors for failed resumes at the PM layer. Reviewed-by: Alex Deucher Signed-off-by: Pavan Kumar Ramayanam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d8f131ed10cb..b0378cfe2218 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1573,6 +1573,9 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) amdgpu_device_baco_exit(drm_dev); } ret = amdgpu_device_resume(drm_dev, false); + if (ret) + return ret; + if (amdgpu_device_supports_px(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_ON; adev->in_runpm = false; -- cgit