From 1da37801a8b0fffb024fea594c7f1d7867ed8aa0 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Wed, 6 Nov 2019 14:38:55 -0500 Subject: drm/amd/display: Drop CONFIG_DRM_AMD_DC_DCN2_0 and DSC_SUPPORTED [Why] DCN2 and DSC are stable enough to be build by default. So drop the flags. [How] Remove them using the unifdef tool. The following commands were executed in sequence: $ find -name '*.c' -exec unifdef -m -DCONFIG_DRM_AMD_DC_DSC_SUPPORT -DCONFIG_DRM_AMD_DC_DCN2_0 -UCONFIG_TRIM_DRM_AMD_DC_DCN2_0 '{}' ';' $ find -name '*.h' -exec unifdef -m -DCONFIG_DRM_AMD_DC_DSC_SUPPORT -DCONFIG_DRM_AMD_DC_DCN2_0 -UCONFIG_TRIM_DRM_AMD_DC_DCN2_0 '{}' ';' In addition: * Remove from kconfig, and replace any dependencies with DCN1_0. * Remove from any makefiles. * Fix and cleanup NV defninitions in dal_asic_id.h * Expand DCN1 ifdef to include DCN2 code in the following files: * clk_mgr/clk_mgr.c: dc_clk_mgr_create() * core/dc_resources.c: dc_create_resource_pool() * dce/dce_dmcu.c: dcn20_*lock_phy() * dce/dce_dmcu.c: dcn20_funcs * dce/dce_dmcu.c: dcn20_dmcu_create() * gpio/hw_factory.c: dal_hw_factory_init() * gpio/hw_translate.c: dal_hw_translate_init() Signed-off-by: Leo Li Signed-off-by: Bhawanpreet Lakha Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8ff69a5c2327..1b865d7f904d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1527,7 +1527,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) } parse_soc_bounding_box: -#ifdef CONFIG_DRM_AMD_DC_DCN2_0 /* * soc bounding box info is not integrated in disocovery table, * we always need to parse it from gpu info firmware. @@ -1538,7 +1537,6 @@ parse_soc_bounding_box: le32_to_cpu(hdr->header.ucode_array_offset_bytes)); adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; } -#endif break; } default: @@ -2602,8 +2600,6 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_VEGA20: #if defined(CONFIG_DRM_AMD_DC_DCN1_0) case CHIP_RAVEN: -#endif -#if defined(CONFIG_DRM_AMD_DC_DCN2_0) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: -- cgit From aca935c7cc866a935a61769c9e9782dd834a8502 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Wed, 6 Nov 2019 14:44:19 -0500 Subject: drm/amd/display: Drop CONFIG_DRM_AMD_DC_DCN2_1 flag [Why] DCN21 is stable enough to be build by default. So drop the flags. [How] Remove them using the unifdef tool. The following commands were executed in sequence: $ find -name '*.c' -exec unifdef -m -DCONFIG_DRM_AMD_DC_DCN2_1 -UCONFIG_TRIM_DRM_AMD_DC_DCN2_1 '{}' ';' $ find -name '*.h' -exec unifdef -m -DCONFIG_DRM_AMD_DC_DCN2_1 -UCONFIG_TRIM_DRM_AMD_DC_DCN2_1 '{}' ';' In addition: * Remove from kconfig, and replace any dependencies with DCN1_0. * Remove from any makefiles. * Fix and cleanup Renoir definitions in dal_asic_id.h * Expand DCN1 ifdef to include DCN21 code in the following files: * clk_mgr/clk_mgr.c: dc_clk_mgr_create() * core/dc_resources.c: dc_create_resource_pool() * gpio/hw_factory.c: dal_hw_factory_init() * gpio/hw_translate.c: dal_hw_translate_init() Signed-off-by: Bhawanpreet Lakha Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1b865d7f904d..329bd3787e57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2603,8 +2603,6 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: -#endif -#if defined(CONFIG_DRM_AMD_DC_DCN2_1) case CHIP_RENOIR: #endif return amdgpu_dc != 0; -- cgit From b86a1aa36a92bcfbc062c5e99c1d084f27f25bab Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Wed, 6 Nov 2019 14:48:35 -0500 Subject: drm/amd/display: rename DCN1_0 kconfig to DCN Since dcn20 and dcn21 are under dcn1 it doesnt make sense to have it named dcn1. Change it to "dcn" to make it generic Signed-off-by: Bhawanpreet Lakha Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 329bd3787e57..9d210bb9bf33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2598,7 +2598,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: -#if defined(CONFIG_DRM_AMD_DC_DCN1_0) +#if defined(CONFIG_DRM_AMD_DC_DCN) case CHIP_RAVEN: case CHIP_NAVI10: case CHIP_NAVI14: -- cgit From b8b721305770cf85bffbe7ce1e0dc5fb6c4fef47 Mon Sep 17 00:00:00 2001 From: yu kuai Date: Mon, 4 Nov 2019 21:27:21 +0800 Subject: drm/amdgpu: add function parameter description in 'amdgpu_device_set_cg_state' Fixes gcc warning: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1954: warning: Function parameter or member 'state' not described in 'amdgpu_device_set_cg_state' Fixes: e3ecdffac9cc ("drm/amdgpu: add documentation for amdgpu_device.c") Signed-off-by: yu kuai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9d210bb9bf33..0ad61febbb5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1936,6 +1936,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * amdgpu_device_set_cg_state - set clockgating for amdgpu device * * @adev: amdgpu_device pointer + * @state: clockgating state (gate or ungate) * * The list of all the hardware IPs that make up the asic is walked and the * set_clockgating_state callbacks are run. -- cgit From 52f2e779ad86daf6eb39f02eaab94b7326a546cb Mon Sep 17 00:00:00 2001 From: Leo Liu Date: Fri, 8 Nov 2019 15:00:58 -0500 Subject: drm/amdgpu: add driver support for JPEG2.0 and above By using JPEG IP block type Signed-off-by: Leo Liu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0ad61febbb5f..cdd8ddab8f78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1961,6 +1961,7 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_clockgating_state) { /* enable clockgating to save power */ r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, @@ -1991,6 +1992,7 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_powergating_state) { /* enable powergating to save power */ r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, -- cgit From d0d13fe874909542d2936056c0f8b36e70079570 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Mon, 18 Nov 2019 16:06:00 +0800 Subject: drm/amdgpu: put flush_delayed_work at first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is one regression from 042f3d7b745cd76aa To put flush_delayed_work after adev->shutdown = true which will make amdgpu_ih_process not response the irq At last, all ib ring tests will be failed just like below [drm] amdgpu: finishing device. [drm] Fence fallback timer expired on ring gfx [drm] Fence fallback timer expired on ring comp_1.0.0 [drm] Fence fallback timer expired on ring comp_1.1.0 [drm] Fence fallback timer expired on ring comp_1.2.0 [drm] Fence fallback timer expired on ring comp_1.3.0 [drm] Fence fallback timer expired on ring comp_1.0.1 amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.1.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.2.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.3.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on sdma0 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on sdma1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on uvd_enc_0.0 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on vce0 (-110). [drm:amdgpu_device_delayed_init_work_handler [amdgpu]] *ERROR* ib ring test failed (-110). v2: replace cancel_delayed_work_sync() with flush_delayed_work() Signed-off-by: Yintian Tao Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cdd8ddab8f78..27ed48bde3fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3106,9 +3106,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) int r; DRM_INFO("amdgpu: finishing device.\n"); - adev->shutdown = true; - flush_delayed_work(&adev->delayed_init_work); + adev->shutdown = true; /* disable all interrupts */ amdgpu_irq_disable_all(adev); @@ -3127,7 +3126,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; } adev->accel_working = false; - cancel_delayed_work_sync(&adev->delayed_init_work); /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) amdgpu_i2c_fini(adev); -- cgit From a69cba42b11ae5e8cede2ee6a61d9faf5187df9b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 28 Oct 2019 14:47:38 -0400 Subject: drm/amdgpu: add a amdgpu_device_supports_baco helper BACO - Bus Active, Chip Off To check if a device supports BACO or not. This will be used in determining when to enable runtime pm. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 27ed48bde3fa..314138a95ccd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -153,6 +153,21 @@ bool amdgpu_device_is_px(struct drm_device *dev) return false; } +/** + * amdgpu_device_supports_baco - Does the device support BACO + * + * @dev: drm_device pointer + * + * Returns true if the device supporte BACO, + * otherwise return false. + */ +bool amdgpu_device_supports_baco(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + + return amdgpu_asic_supports_baco(adev); +} + /** * VRAM access helper functions. * -- cgit From 31af062acfbd5db8b0b99d0ad418b33d4458e206 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 10:42:22 -0500 Subject: drm/amdgpu: rename amdgpu_device_is_px to amdgpu_device_supports_boco (v2) BACO - Bus Active, Chip Off BOCO - Bus Off, Chip Off To better match what we are checking for and to align with amdgpu_device_supports_baco. BOCO is used on PowerXpress/Hybrid Graphics systems and BACO is used on desktop dGPU boards. v2: fix typo in documentation Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 314138a95ccd..635091bad874 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -137,14 +137,14 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO, static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); /** - * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control + * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control * * @dev: drm_device pointer * * Returns true if the device is a dGPU with HG/PX power control, * otherwise return false. */ -bool amdgpu_device_is_px(struct drm_device *dev) +bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; @@ -1088,7 +1088,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero { struct drm_device *dev = pci_get_drvdata(pdev); - if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF) + if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) return; if (state == VGA_SWITCHEROO_ON) { @@ -2913,7 +2913,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, * ignore it */ vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - if (amdgpu_device_is_px(ddev)) + if (amdgpu_device_supports_boco(ddev)) runtime = true; if (!pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_register_client(adev->pdev, -- cgit From 361dbd01a1de8bdd6bdf9a879ae23a121b8f7266 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 12:33:09 -0500 Subject: drm/amdgpu: add helpers for baco entry and exit BACO - Bus Active, Chip Off Will be used for runtime pm. Entry will enter the BACO state (chip off). Exit will exit the BACO state (chip on). Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 61 ++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 635091bad874..612c4cc82d6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4296,3 +4296,64 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) } } +int amdgpu_device_baco_enter(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + + if (!amdgpu_device_supports_baco(adev->ddev)) + return -ENOTSUPP; + + if (is_support_sw_smu(adev)) { + struct smu_context *smu = &adev->smu; + int ret; + + ret = smu_baco_enter(smu); + if (ret) + return ret; + + return 0; + } else { + void *pp_handle = adev->powerplay.pp_handle; + const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; + + if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state) + return -ENOENT; + + /* enter BACO state */ + if (pp_funcs->set_asic_baco_state(pp_handle, 1)) + return -EIO; + + return 0; + } +} + +int amdgpu_device_baco_exit(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + + if (!amdgpu_device_supports_baco(adev->ddev)) + return -ENOTSUPP; + + if (is_support_sw_smu(adev)) { + struct smu_context *smu = &adev->smu; + int ret; + + ret = smu_baco_exit(smu); + if (ret) + return ret; + + return 0; + } else { + void *pp_handle = adev->powerplay.pp_handle; + const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; + + if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state) + return -ENOENT; + + /* exit BACO state */ + if (pp_funcs->set_asic_baco_state(pp_handle, 0)) + return -EIO; + + return 0; + } +} -- cgit From 3840c5bcc2456381ca53f3f9604915aa36249faf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 13:25:37 -0500 Subject: drm/amdgpu: disentangle runtime pm and vga_switcheroo Originally we only supported runtime pm on PX/HG laptops so vga_switcheroo and runtime pm are sort of entangled. Attempt to logically separate them. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 612c4cc82d6c..d472526d97b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2743,7 +2743,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, uint32_t flags) { int r, i; - bool runtime = false; + bool boco = false; u32 max_MBps; adev->shutdown = false; @@ -2914,11 +2914,14 @@ int amdgpu_device_init(struct amdgpu_device *adev, vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); if (amdgpu_device_supports_boco(ddev)) - runtime = true; - if (!pci_is_thunderbolt_attached(adev->pdev)) + boco = true; + if (amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && + !pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, runtime); - if (runtime) + &amdgpu_switcheroo_ops, boco); + if (boco) vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); if (amdgpu_emu_mode == 1) { @@ -3102,7 +3105,7 @@ fence_driver_init: failed: amdgpu_vf_error_trans_all(adev); - if (runtime) + if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); return r; @@ -3150,9 +3153,12 @@ void amdgpu_device_fini(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; - if (!pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && + !pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_unregister_client(adev->pdev); - if (adev->flags & AMD_IS_PX) + if (amdgpu_device_supports_boco(adev->ddev)) vga_switcheroo_fini_domain_pm_ops(adev->dev); vga_client_register(adev->pdev, NULL, NULL, NULL); if (adev->rio_mem) -- cgit From de185019bcb9d824d3dd5a80746571e83644b636 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 20 Nov 2019 17:31:11 -0500 Subject: drm/amdgpu: move pci handling out of pm ops The documentation says the that PCI core handles this for you unless you choose to implement it. Just rely on the PCI core to handle the pci specific bits. Reviewed-by: Zhan Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 +++++++++++++----------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d472526d97b9..aae8a29f48ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1087,6 +1087,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) { struct drm_device *dev = pci_get_drvdata(pdev); + int r; if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) return; @@ -1096,7 +1097,12 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero /* don't suspend or resume card normally */ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - amdgpu_device_resume(dev, true, true); + pci_set_power_state(dev->pdev, PCI_D0); + pci_restore_state(dev->pdev); + r = pci_enable_device(dev->pdev); + if (r) + DRM_WARN("pci_enable_device failed (%d)\n", r); + amdgpu_device_resume(dev, true); dev->switch_power_state = DRM_SWITCH_POWER_ON; drm_kms_helper_poll_enable(dev); @@ -1104,7 +1110,11 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero pr_info("amdgpu: switched off\n"); drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - amdgpu_device_suspend(dev, true, true); + amdgpu_device_suspend(dev, true); + pci_save_state(dev->pdev); + /* Shut down the device */ + pci_disable_device(dev->pdev); + pci_set_power_state(dev->pdev, PCI_D3cold); dev->switch_power_state = DRM_SWITCH_POWER_OFF; } } @@ -3195,7 +3205,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) * Returns 0 for success or an error on failure. * Called at driver suspend. */ -int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) +int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) { struct amdgpu_device *adev; struct drm_crtc *crtc; @@ -3278,13 +3288,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) */ amdgpu_bo_evict_vram(adev); - if (suspend) { - pci_save_state(dev->pdev); - /* Shut down the device */ - pci_disable_device(dev->pdev); - pci_set_power_state(dev->pdev, PCI_D3hot); - } - return 0; } @@ -3299,7 +3302,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) * Returns 0 for success or an error on failure. * Called at driver resume. */ -int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) +int amdgpu_device_resume(struct drm_device *dev, bool fbcon) { struct drm_connector *connector; struct drm_connector_list_iter iter; @@ -3310,14 +3313,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; - if (resume) { - pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); - r = pci_enable_device(dev->pdev); - if (r) - return r; - } - /* post card */ if (amdgpu_device_need_post(adev)) { r = amdgpu_atom_asic_init(adev->mode_info.atom_context); -- cgit From 7c868b592d5c2c9d54c8ab92624efcf2d3d41bc6 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Fri, 29 Nov 2019 16:05:55 +0800 Subject: drm/amdgpu: not remove sysfs if not create sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When load amdgpu failed before create pm_sysfs and ucode_sysfs, the pm_sysfs and ucode_sysfs should not be removed. Otherwise, there will be warning call trace just like below. [ 24.836386] [drm] VCE initialized successfully. [ 24.841352] amdgpu 0000:00:07.0: amdgpu_device_ip_init failed [ 25.370383] amdgpu 0000:00:07.0: Fatal error during GPU init [ 25.889575] [drm] amdgpu: finishing device. [ 26.069128] amdgpu 0000:00:07.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring kiq_2.1.0 test failed (-110) [ 26.070110] [drm:gfx_v9_0_hw_fini [amdgpu]] *ERROR* KCQ disable failed [ 26.200309] [TTM] Finalizing pool allocator [ 26.200314] [TTM] Finalizing DMA pool allocator [ 26.200349] [TTM] Zone kernel: Used memory at exit: 0 KiB [ 26.200351] [TTM] Zone dma32: Used memory at exit: 0 KiB [ 26.200353] [drm] amdgpu: ttm finalized [ 26.205329] ------------[ cut here ]------------ [ 26.205330] sysfs group 'fw_version' not found for kobject '0000:00:07.0' [ 26.205347] WARNING: CPU: 0 PID: 1228 at fs/sysfs/group.c:256 sysfs_remove_group+0x80/0x90 [ 26.205348] Modules linked in: amdgpu(OE+) gpu_sched(OE) ttm(OE) drm_kms_helper(OE) drm(OE) i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache binfmt_misc snd_hda_codec_generic ledtrig_audio crct10dif_pclmul snd_hda_intel crc32_pclmul snd_hda_codec ghash_clmulni_intel snd_hda_core snd_hwdep snd_pcm snd_timer input_leds snd joydev soundcore serio_raw pcspkr evbug aesni_intel aes_x86_64 crypto_simd cryptd mac_hid glue_helper sunrpc ip_tables x_tables autofs4 8139too psmouse 8139cp mii i2c_piix4 pata_acpi floppy [ 26.205369] CPU: 0 PID: 1228 Comm: modprobe Tainted: G OE 5.2.0-rc1 #1 [ 26.205370] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 26.205372] RIP: 0010:sysfs_remove_group+0x80/0x90 [ 26.205374] Code: e8 35 b9 ff ff 5b 41 5c 41 5d 5d c3 48 89 df e8 f6 b5 ff ff eb c6 49 8b 55 00 49 8b 34 24 48 c7 c7 48 7a 70 98 e8 60 63 d3 ff <0f> 0b eb d7 66 90 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 [ 26.205375] RSP: 0018:ffffbee242b0b908 EFLAGS: 00010282 [ 26.205376] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 [ 26.205377] RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff97ad6f817380 [ 26.205377] RBP: ffffbee242b0b920 R08: ffffffff98f520c4 R09: 00000000000002b3 [ 26.205378] R10: ffffbee242b0b8f8 R11: 00000000000002b3 R12: ffffffffc0e58240 [ 26.205379] R13: ffff97ad6d1fe0b0 R14: ffff97ad4db954c8 R15: ffff97ad4db7fff0 [ 26.205380] FS: 00007ff3d8a1c4c0(0000) GS:ffff97ad6f800000(0000) knlGS:0000000000000000 [ 26.205381] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 26.205381] CR2: 00007f9b2ef1df04 CR3: 000000042aab8001 CR4: 00000000003606f0 [ 26.205384] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 26.205385] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 26.205385] Call Trace: [ 26.205461] amdgpu_ucode_sysfs_fini+0x18/0x20 [amdgpu] [ 26.205518] amdgpu_device_fini+0x3b4/0x560 [amdgpu] [ 26.205573] amdgpu_driver_unload_kms+0x4f/0xa0 [amdgpu] [ 26.205623] amdgpu_driver_load_kms+0xcd/0x250 [amdgpu] [ 26.205637] drm_dev_register+0x12b/0x1c0 [drm] [ 26.205695] amdgpu_pci_probe+0x12a/0x1e0 [amdgpu] [ 26.205699] local_pci_probe+0x47/0xa0 [ 26.205701] pci_device_probe+0x106/0x1b0 [ 26.205704] really_probe+0x21a/0x3f0 [ 26.205706] driver_probe_device+0x11c/0x140 [ 26.205707] device_driver_attach+0x58/0x60 [ 26.205709] __driver_attach+0xc3/0x140 Signed-off-by: Yintian Tao Acked-by: Christian König Reviewed-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aae8a29f48ad..baee37c92352 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3041,12 +3041,18 @@ fence_driver_init: amdgpu_pm_virt_sysfs_init(adev); r = amdgpu_pm_sysfs_init(adev); - if (r) + if (r) { + adev->pm_sysfs_en = false; DRM_ERROR("registering pm debugfs failed (%d).\n", r); + } else + adev->pm_sysfs_en = true; r = amdgpu_ucode_sysfs_init(adev); - if (r) + if (r) { + adev->ucode_sysfs_en = false; DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); + } else + adev->ucode_sysfs_en = true; r = amdgpu_debugfs_gem_init(adev); if (r) @@ -3146,7 +3152,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) drm_atomic_helper_shutdown(adev->ddev); } amdgpu_fence_driver_fini(adev); - amdgpu_pm_sysfs_fini(adev); + if (adev->pm_sysfs_en) + amdgpu_pm_sysfs_fini(adev); amdgpu_fbdev_fini(adev); r = amdgpu_device_ip_fini(adev); if (adev->firmware.gpu_info_fw) { @@ -3182,7 +3189,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_debugfs_regs_cleanup(adev); device_remove_file(adev->dev, &dev_attr_pcie_replay_count); - amdgpu_ucode_sysfs_fini(adev); + if (adev->ucode_sysfs_en) + amdgpu_ucode_sysfs_fini(adev); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); amdgpu_debugfs_preempt_cleanup(adev); -- cgit From 0ea203a91247082f1294e6aa6e7802914665da50 Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Tue, 3 Dec 2019 01:53:10 +0800 Subject: drm/amdgpu/sriov: No need the event 3 and 4 now As will call unload kms when initialize fail, and the unload kms will send event 3 and 4, so don't need event 3 and 4 in device init. Signed-off-by: Emily Deng Reviewed-by: Zhan Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index baee37c92352..dd46199287d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3018,8 +3018,6 @@ fence_driver_init: } dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); - if (amdgpu_virt_request_full_gpu(adev, false)) - amdgpu_virt_release_full_gpu(adev, false); goto failed; } -- cgit From 7a22677b9514d0e819eb85d687eb8d8ef10ab330 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 26 Nov 2019 17:24:56 +0800 Subject: drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper This operation is needed when baco entry/exit for ras recovery Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dd46199287d8..fc53faac4147 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4306,10 +4306,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) int amdgpu_device_baco_enter(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (!amdgpu_device_supports_baco(adev->ddev)) return -ENOTSUPP; + if (ras && ras->supported) + adev->nbio.funcs->enable_doorbell_interrupt(adev, false); + if (is_support_sw_smu(adev)) { struct smu_context *smu = &adev->smu; int ret; @@ -4317,8 +4321,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev) ret = smu_baco_enter(smu); if (ret) return ret; - - return 0; } else { void *pp_handle = adev->powerplay.pp_handle; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4329,14 +4331,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev) /* enter BACO state */ if (pp_funcs->set_asic_baco_state(pp_handle, 1)) return -EIO; - - return 0; } + + return 0; } int amdgpu_device_baco_exit(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (!amdgpu_device_supports_baco(adev->ddev)) return -ENOTSUPP; @@ -4349,7 +4352,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev) if (ret) return ret; - return 0; } else { void *pp_handle = adev->powerplay.pp_handle; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4360,7 +4362,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev) /* exit BACO state */ if (pp_funcs->set_asic_baco_state(pp_handle, 0)) return -EIO; - - return 0; } + + if (ras && ras->supported) + adev->nbio.funcs->enable_doorbell_interrupt(adev, true); + + return 0; } -- cgit From ce316fa55ef0f1751276b846a54fb3b835bd5e64 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 26 Nov 2019 22:12:31 +0800 Subject: drm/amdgpu: add concurrent baco reset support for XGMI Currently each XGMI node reset wq does not run in parrallel if bound to same cpu. Make change to bound the xgmi_reset_work item to different cpus. XGMI requires all nodes enter into baco within very close proximity before any node exit baco. So schedule the xgmi_reset_work wq twice for enter/exit baco respectively. To use baco for XGMI, PMFW supported for baco on XGMI needs to be involved. The case that PSP reset and baco reset coexist within an XGMI hive never exist and is not in the consideration. v2: define use_baco flag to simplify the code for xgmi baco sequence Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 82 +++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fc53faac4147..114f5bca581a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2661,7 +2661,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - adev->asic_reset_res = amdgpu_asic_reset(adev); + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) + adev->asic_reset_res = (adev->in_baco == false) ? + amdgpu_device_baco_enter(adev->ddev) : + amdgpu_device_baco_exit(adev->ddev); + else + adev->asic_reset_res = amdgpu_asic_reset(adev); + if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); @@ -3787,13 +3793,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; + int cpu = smp_processor_id(); + bool use_baco = + (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? + true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3801,21 +3812,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* For XGMI run all resets in parallel to speed up the process */ + /* + * For XGMI run all resets in parallel to speed up the + * process by scheduling the highpri wq on different + * cpus. For XGMI with baco reset, all nodes must enter + * baco within close proximity before anyone exit. + */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) + if (!queue_work_on(cpu, system_highpri_wq, + &tmp_adev->xgmi_reset_work)) r = -EALREADY; + cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - - if (r) { - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", - r, tmp_adev->ddev->unique); + if (r) break; - } } - /* For XGMI wait for all PSP resets to complete before proceed */ + /* For XGMI wait for all work to complete before proceed */ if (!r) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { @@ -3824,11 +3838,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, r = tmp_adev->asic_reset_res; if (r) break; + if (use_baco) + tmp_adev->in_baco = true; } } } - } + /* + * For XGMI with baco reset, need exit baco phase by scheduling + * xgmi_reset_work one more time. PSP reset and sGPU skips this + * phase. Not assume the situation that PSP reset and baco reset + * coexist within an XGMI hive. + */ + + if (!r && use_baco) { + cpu = smp_processor_id(); + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + if (!queue_work_on(cpu, + system_highpri_wq, + &tmp_adev->xgmi_reset_work)) + r = -EALREADY; + if (r) + break; + cpu = cpumask_next(cpu, cpu_online_mask); + } + } + } + + if (!r && use_baco) { + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + flush_work(&tmp_adev->xgmi_reset_work); + r = tmp_adev->asic_reset_res; + if (r) + break; + tmp_adev->in_baco = false; + } + } + } + + if (r) { + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", + r, tmp_adev->ddev->unique); + goto end; + } + } list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { @@ -4113,7 +4170,8 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); + r = amdgpu_do_asic_reset(adev, hive, device_list_handle, + &need_full_reset); if (r && r == -EAGAIN) goto retry; } -- cgit From b823821f2244add19a71e7fe6c8f8550a29d672d Mon Sep 17 00:00:00 2001 From: Le Ma Date: Wed, 27 Nov 2019 13:17:17 +0800 Subject: drm/amdgpu: support full gpu reset workflow when ras err_event_athub occurs This athub fatal error can be recovered by baco without system-level reboot, so add a mode to use baco for the recovery. Not affect the default psp reset situations for now. Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 114f5bca581a..2957ebf9b97c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4018,12 +4018,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_device *tmp_adev = NULL; int i, r = 0; bool in_ras_intr = amdgpu_ras_intr_triggered(); + bool use_baco = + (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? + true : false; /* * Flush RAM to disk so that after reboot * the user can read log and see why the system rebooted. */ - if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { + if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { DRM_WARN("Emergency reboot."); @@ -4034,7 +4037,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset"); + dev_info(adev->dev, "GPU %s begin!\n", + (in_ras_intr && !use_baco) ? "jobs stop":"reset"); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -4101,7 +4105,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_unregister_gpu_instance(tmp_adev); /* disable ras on ALL IPs */ - if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!(in_ras_intr && !use_baco) && + amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -4112,13 +4117,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, drm_sched_stop(&ring->sched, job ? &job->base : NULL); - if (in_ras_intr) + if (in_ras_intr && !use_baco) amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } - if (in_ras_intr) + if (in_ras_intr && !use_baco) goto skip_sched_resume; /* @@ -4212,7 +4217,7 @@ skip_hw_reset: skip_sched_resume: list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /*unlock kfd: SRIOV would do it separately */ - if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev)) + if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } -- cgit From 00eaa57172a02edddbf445112409e807e0caacd9 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Fri, 25 Oct 2019 17:19:38 +0800 Subject: drm/amdgpu: clear err_event_athub flag after reset exit Otherwise next err_event_athub error cannot call gpu reset. And following resume sequence will not be affected by this flag. v2: create function to clear amdgpu_ras_in_intr for modularity of ras driver Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2957ebf9b97c..5a8506182ade 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3887,6 +3887,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, } } + if (!r && amdgpu_ras_intr_triggered()) + amdgpu_ras_intr_cleared(); + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { /* post card */ -- cgit From c9ffa427db34e6896523f0ef0c172a0bbb77c9ad Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Wed, 30 Oct 2019 17:16:35 +0800 Subject: drm/amd/powerplay: enable pp one vf mode for vega10 Originally, due to the restriction from PSP and SMU, VF has to send message to hypervisor driver to handle powerplay change which is complicated and redundant. Currently, SMU and PSP can support VF to directly handle powerplay change by itself. Therefore, the old code about the handshake between VF and PF to handle powerplay will be removed and VF will use new the registers below to handshake with SMU. mmMP1_SMN_C2PMSG_101: register to handle SMU message mmMP1_SMN_C2PMSG_102: register to handle SMU parameter mmMP1_SMN_C2PMSG_103: register to handle SMU response v2: remove module parameter pp_one_vf v3: fix the parens v4: forbid vf to change smu feature v5: use hwmon_attributes_visible to skip sepicified hwmon atrribute v6: change skip condition at vega10_copy_table_to_smc Signed-off-by: Yintian Tao Acked-by: Evan Quan Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a8506182ade..95988ff7356d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1877,6 +1877,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } } + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_init_data_exchange(adev); + r = amdgpu_ib_pool_init(adev); if (r) { dev_err(adev->dev, "IB initialization failed (%d).\n", r); @@ -1918,11 +1921,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_amdkfd_device_init(adev); init_failed: - if (amdgpu_sriov_vf(adev)) { - if (!r) - amdgpu_virt_init_data_exchange(adev); + if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, true); - } return r; } @@ -2822,7 +2822,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); mutex_init(&adev->lock_reset); - mutex_init(&adev->virt.dpm_mutex); mutex_init(&adev->psp.mutex); r = amdgpu_device_check_arguments(adev); @@ -3041,9 +3040,6 @@ fence_driver_init: amdgpu_fbdev_init(adev); - if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) - amdgpu_pm_virt_sysfs_init(adev); - r = amdgpu_pm_sysfs_init(adev); if (r) { adev->pm_sysfs_en = false; @@ -3188,8 +3184,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; amdgpu_device_doorbell_fini(adev); - if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) - amdgpu_pm_virt_sysfs_fini(adev); amdgpu_debugfs_regs_cleanup(adev); device_remove_file(adev->dev, &dev_attr_pcie_replay_count); @@ -3670,6 +3664,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) goto error; + amdgpu_virt_init_data_exchange(adev); /* we need recover gart prior to run SMC/CP/SDMA resume */ amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); @@ -3687,7 +3682,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_amdkfd_post_reset(adev); error: - amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { amdgpu_inc_vram_lost(adev); -- cgit From 93b09a9a892ba07bd99132d9bb6318e31c7f2f79 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Wed, 11 Dec 2019 10:21:01 +0000 Subject: drm/amdgpu: log when amdgpu.dc=1 but ASIC is unsupported This makes it easier to figure out whether the kernel parameter has been taken into account. Signed-off-by: Simon Ser Cc: Harry Wentland Cc: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 95988ff7356d..a97946878024 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2636,6 +2636,9 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) return amdgpu_dc != 0; #endif default: + if (amdgpu_dc > 0) + DRM_INFO("Display Core has been requested via kernel parameter " + "but isn't supported by ASIC, ignoring\n"); return false; } } -- cgit From d7f72fe482bfb7f28c00d99be6d96c5ebad6eacf Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 11 Dec 2019 18:04:05 -0500 Subject: drm/amdgpu: Add CU info print log The log will be useful for easily getting the CU info on various emulation models or ASICs. Signed-off-by: Yong Zhao Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a97946878024..f34017538adb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3029,6 +3029,12 @@ fence_driver_init: goto failed; } + DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", + adev->gfx.config.max_shader_engines, + adev->gfx.config.max_sh_per_se, + adev->gfx.config.max_cu_per_sh, + adev->gfx.cu_info.number); + adev->accel_working = true; amdgpu_vm_check_compute_bug(adev); -- cgit From 8973d9ec8f0e5208909cae81bdeae87ba657359d Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Mon, 16 Dec 2019 17:19:44 +0800 Subject: drm/amdgpu/sriov: Tonga sriov also need load firmware with smu Fix Tonga sriov load driver fail issue. Signed-off-by: Emily Deng Reviewd-by Yintian Tao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f34017538adb..cc4ef4db90e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1810,7 +1810,8 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) } } - r = amdgpu_pm_load_smu_firmware(adev, &smu_version); + if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) + r = amdgpu_pm_load_smu_firmware(adev, &smu_version); return r; } -- cgit From 0c88b43032131ff458818addc9b65b8bd915837d Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 6 Dec 2019 16:55:49 +0100 Subject: drm/amdgpu: replace vm_pte's run-queue list with drm gpu scheds list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_entity_init() takes drm gpu scheduler list instead of drm_sched_rq list. This makes conversion of drm_sched_rq list to drm gpu scheduler list unnecessary Signed-off-by: Nirmoy Das Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cc4ef4db90e5..db91663df4f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2786,7 +2786,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->mman.buffer_funcs = NULL; adev->mman.buffer_funcs_ring = NULL; adev->vm_manager.vm_pte_funcs = NULL; - adev->vm_manager.vm_pte_num_rqs = 0; + adev->vm_manager.vm_pte_num_scheds = 0; adev->gmc.gmc_funcs = NULL; adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); -- cgit From f880799d7fcf0a63ca2295d950cd12f5520251d9 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Mon, 16 Dec 2019 14:43:34 +0100 Subject: amd/amdgpu: add sched array to IPs with multiple run-queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This sched array can be passed on to entity creation routine instead of manually creating such sched array on every context creation. v2: squash in missing break fix Signed-off-by: Nirmoy Das Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index db91663df4f6..e1f8b715301a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3036,6 +3036,8 @@ fence_driver_init: adev->gfx.config.max_cu_per_sh, adev->gfx.cu_info.number); + amdgpu_ctx_init_sched(adev); + adev->accel_working = true; amdgpu_vm_check_compute_bug(adev); -- cgit From 5a7489a7e189ee2be889485f90c8cf24ea4b9a40 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Tue, 17 Dec 2019 18:16:44 +0800 Subject: drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e1f8b715301a..d7a57435a832 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3669,8 +3669,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- cgit From 041a62bc0603544c97ac407df67bd60398ce0668 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 6 Dec 2019 13:19:15 -0500 Subject: drm/amdgpu: reverts commit ce316fa55ef0f1751276b846a54fb3b835bd5e64. In preparation for doing XGMI reset synchronization using task barrier. Signed-off-by: Andrey Grodzovsky Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++-------------------------- 1 file changed, 10 insertions(+), 63 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d7a57435a832..6d52168454b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3797,18 +3797,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; - int cpu = smp_processor_id(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3816,62 +3811,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* - * For XGMI run all resets in parallel to speed up the - * process by scheduling the highpri wq on different - * cpus. For XGMI with baco reset, all nodes must enter - * baco within close proximity before anyone exit. - */ + /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, system_highpri_wq, - &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; - cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - if (r) - break; - } - /* For XGMI wait for all work to complete before proceed */ - if (!r) { - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - flush_work(&tmp_adev->xgmi_reset_work); - r = tmp_adev->asic_reset_res; - if (r) - break; - if (use_baco) - tmp_adev->in_baco = true; - } - } - } - - /* - * For XGMI with baco reset, need exit baco phase by scheduling - * xgmi_reset_work one more time. PSP reset and sGPU skips this - * phase. Not assume the situation that PSP reset and baco reset - * coexist within an XGMI hive. - */ - - if (!r && use_baco) { - cpu = smp_processor_id(); - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, - system_highpri_wq, - &tmp_adev->xgmi_reset_work)) - r = -EALREADY; - if (r) - break; - cpu = cpumask_next(cpu, cpu_online_mask); - } + if (r) { + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", + r, tmp_adev->ddev->unique); + break; } } - if (!r && use_baco) { + /* For XGMI wait for all resets to complete before proceed */ + if (!r) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3879,16 +3834,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, r = tmp_adev->asic_reset_res; if (r) break; - tmp_adev->in_baco = false; } } } - - if (r) { - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", - r, tmp_adev->ddev->unique); - goto end; - } } if (!r && amdgpu_ras_intr_triggered()) @@ -4182,8 +4130,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(adev, hive, device_list_handle, - &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); if (r && r == -EAGAIN) goto retry; } -- cgit From c6a6e2db994528a3eaf1ed938a0b7a35b87b7fa4 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 11 Dec 2019 14:18:31 -0500 Subject: drm/amdgpu: Redo XGMI reset synchronization. Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive. v2: Return right away with a warning if no xgmi hive, update doc. Signed-off-by: Andrey Grodzovsky Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 +++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6d52168454b4..277caaf1ea26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -66,6 +66,7 @@ #include "amdgpu_pmu.h" #include +#include MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -2664,14 +2665,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); - if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) - adev->asic_reset_res = (adev->in_baco == false) ? - amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); - else - adev->asic_reset_res = amdgpu_asic_reset(adev); + /* It's a bug to not have a hive within this function */ + if (WARN_ON(!hive)) + return; + + /* + * Use task barrier to synchronize all xgmi reset works across the + * hive. task_barrier_enter and task_barrier_exit will block + * until all the threads running the xgmi reset works reach + * those points. task_barrier_full will do both blocks. + */ + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { + + task_barrier_enter(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); + + if (adev->asic_reset_res) + goto fail; + + task_barrier_exit(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); + + if (adev->asic_reset_res) + goto fail; + } else { + + task_barrier_full(&hive->tb); + adev->asic_reset_res = amdgpu_asic_reset(adev); + } +fail: if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); -- cgit From c96cf2823dfdc51d3a41addff671576c5a2f0862 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 11 Dec 2019 14:25:36 -0500 Subject: drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq This is to avoid queueing jobs to same CPU during XGMI hive reset because there is a strict timeline for when the reset commands must reach all the GPUs in the hive. Signed-off-by: Andrey Grodzovsky Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 277caaf1ea26..2f93adc8f384 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3838,7 +3838,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; } else r = amdgpu_asic_reset(tmp_adev); -- cgit From d83c7a07a79b55983e5b9cd7447e4304668f7733 Mon Sep 17 00:00:00 2001 From: Jane Jian Date: Mon, 16 Dec 2019 14:56:35 +0800 Subject: drm/amdgpu: update VCN1(dual instances) fw types ID and VCN ip block type Previously there is no VCN1 type ID in psp gfx interface. Also add VCN ip block type unless the reinit after FLR for sriov would fail. Signed-off-by: Jane Jian Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2f93adc8f384..9d69f2dbcfd9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2441,7 +2441,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_GFX, AMD_IP_BLOCK_TYPE_SDMA, AMD_IP_BLOCK_TYPE_UVD, - AMD_IP_BLOCK_TYPE_VCE + AMD_IP_BLOCK_TYPE_VCE, + AMD_IP_BLOCK_TYPE_VCN }; for (i = 0; i < ARRAY_SIZE(ip_order); i++) { -- cgit From e3c00faa7a3d304807a15394254794f1892c0af8 Mon Sep 17 00:00:00 2001 From: Ma Feng Date: Mon, 23 Dec 2019 14:58:27 -0500 Subject: drm/amdgpu: Remove unneeded variable 'ret' in amdgpu_device.c Fixes coccicheck warning: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1036:5-8: Unneeded variable: "ret". Return "0" on line 1079 Reported-by: Hulk Robot Signed-off-by: Ma Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9d69f2dbcfd9..3ab2ca98a8cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1032,8 +1032,6 @@ def_value: */ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) { - int ret = 0; - if (amdgpu_sched_jobs < 4) { dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", amdgpu_sched_jobs); @@ -1073,7 +1071,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); - return ret; + return 0; } /** -- cgit From 2a9b90ae470488f7f1ad37d2c8d7cee8a0fb0c63 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Dec 2019 21:46:19 +0800 Subject: drm/amdgpu: use true, false for bool variable in amdgpu_device.c Fixes coccicheck warning: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:3961:1-19: WARNING: Assignment of 0/1 to bool variable drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:3981:1-19: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3ab2ca98a8cd..9c9c7b32b0ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3953,7 +3953,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) mutex_lock(&adev->lock_reset); atomic_inc(&adev->gpu_reset_counter); - adev->in_gpu_reset = 1; + adev->in_gpu_reset = true; switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; @@ -3973,7 +3973,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - adev->in_gpu_reset = 0; + adev->in_gpu_reset = false; mutex_unlock(&adev->lock_reset); } -- cgit From 895bd048fb0846c912cb896ff58f4341537d0ff1 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 27 Dec 2019 14:44:03 +0800 Subject: amd/amdgpu/sriov tdr enablement with pp_onevf_mode Under sriov and pp_onevf mode, 1.take resume instead of hw_init for smc recover to avoid potential memory leak. 2.add return condition inside smc resume function for sriov_pp_onevf_mode and pm_enabled param. Signed-off-by: Jack Zhang Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9c9c7b32b0ed..9b4c18b3546f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2455,7 +2455,11 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) block->status.hw) continue; - r = block->version->funcs->hw_init(adev); + if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) + r = block->version->funcs->resume(adev); + else + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); if (r) return r; -- cgit From 9530273ec90cc0614f6ac56d0c024e2f39886419 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 7 Jan 2020 16:57:39 +0800 Subject: drm/amd/powerplay: cover the powerplay implementation details V3 This can save users much troubles. As they do not actually need to care whether swSMU or traditional powerplay routine should be used. V2: apply the fixes to vi.c and cik.c also V3: squash in oops fix Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 53 ++++-------------------------- 1 file changed, 6 insertions(+), 47 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9b4c18b3546f..1bbea9669204 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2345,14 +2345,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; /* handle putting the SMC in the appropriate state */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { - if (is_support_sw_smu(adev)) { - r = smu_set_mp1_state(&adev->smu, adev->mp1_state); - } else if (adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->set_mp1_state) { - r = adev->powerplay.pp_funcs->set_mp1_state( - adev->powerplay.pp_handle, - adev->mp1_state); - } + r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); if (r) { DRM_ERROR("SMC failed to set mp1 state %d, %d\n", adev->mp1_state, r); @@ -4359,55 +4352,21 @@ int amdgpu_device_baco_enter(struct drm_device *dev) if (ras && ras->supported) adev->nbio.funcs->enable_doorbell_interrupt(adev, false); - if (is_support_sw_smu(adev)) { - struct smu_context *smu = &adev->smu; - int ret; - - ret = smu_baco_enter(smu); - if (ret) - return ret; - } else { - void *pp_handle = adev->powerplay.pp_handle; - const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; - - if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state) - return -ENOENT; - - /* enter BACO state */ - if (pp_funcs->set_asic_baco_state(pp_handle, 1)) - return -EIO; - } - - return 0; + return amdgpu_dpm_baco_enter(adev); } int amdgpu_device_baco_exit(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int ret = 0; if (!amdgpu_device_supports_baco(adev->ddev)) return -ENOTSUPP; - if (is_support_sw_smu(adev)) { - struct smu_context *smu = &adev->smu; - int ret; - - ret = smu_baco_exit(smu); - if (ret) - return ret; - - } else { - void *pp_handle = adev->powerplay.pp_handle; - const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; - - if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state) - return -ENOENT; - - /* exit BACO state */ - if (pp_funcs->set_asic_baco_state(pp_handle, 0)) - return -EIO; - } + ret = amdgpu_dpm_baco_exit(adev); + if (ret) + return ret; if (ras && ras->supported) adev->nbio.funcs->enable_doorbell_interrupt(adev, true); -- cgit From e9d4cf918f70c6df87265d561aeab8d73397771b Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 16 Jan 2020 12:39:50 +0800 Subject: drm/amdgpu: add arcturus to gpu recovery check code path support check if dirver should try gpu recovery for arcturus Signed-off-by: Hawking Zhang Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1bbea9669204..d3eff4c6289a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3758,6 +3758,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_RAVEN: + case CHIP_ARCTURUS: break; default: goto disabled; -- cgit From bd0522112332663e386df1b8642052463ea9b3b9 Mon Sep 17 00:00:00 2001 From: "Pan, Xinhui" Date: Thu, 16 Jan 2020 06:09:41 +0000 Subject: drm/amdgpu: add the lost mutex_init back MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize notifier_lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1016 Reviewed-by: Feifei Xu Reviewed-by: Christian König Signed-off-by: xinhui pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d3eff4c6289a..53d882000101 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2848,6 +2848,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, hash_init(adev->mn_hash); mutex_init(&adev->lock_reset); mutex_init(&adev->psp.mutex); + mutex_init(&adev->notifier_lock); r = amdgpu_device_check_arguments(adev); if (r) -- cgit