From 56b53c0b5aa5de49747351b2ad323fd36089eb52 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 10 Mar 2021 17:20:45 +0800 Subject: drm/amdgpu: add codes to capture invalid hardware access when recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovery thread has begun GPU reset, there should be not other threads to access hardware, otherwise system randomly hang. v2 (chk): rewritten from scratch, use trylock and lockdep instead of hand wiring the logic. v3: add in_irq check v4: change to check in_task Signed-off-by: Dennis Li Signed-off-by: Christian König Reviewed-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 689addb1520d..f63f66a0c63f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -705,7 +705,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; BUG_ON(!ring->funcs->emit_rreg); @@ -772,7 +772,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) BUG_ON(!ring->funcs->emit_wreg); - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; spin_lock_irqsave(&kiq->ring_lock, flags); -- cgit From 5a8cd98e6e335ad03493502b3479fcbadcb4889e Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 12 Mar 2021 10:42:41 +0100 Subject: drm/amdgpu: wrap kiq ring ops with kiq spinlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KIQ ring is being operated by kfd as well as amdgpu. KFD is using kiq lock, we should the same from amdgpu side as well. Signed-off-by: Nirmoy Das Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index f63f66a0c63f..d087f254e0d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -463,20 +463,25 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev) { struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *kiq_ring = &kiq->ring; - int i; + int i, r; if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; + spin_lock(&adev->gfx.kiq.ring_lock); if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * - adev->gfx.num_compute_rings)) + adev->gfx.num_compute_rings)) { + spin_unlock(&adev->gfx.kiq.ring_lock); return -ENOMEM; + } for (i = 0; i < adev->gfx.num_compute_rings; i++) kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i], RESET_QUEUES, 0, 0); + r = amdgpu_ring_test_helper(kiq_ring); + spin_unlock(&adev->gfx.kiq.ring_lock); - return amdgpu_ring_test_helper(kiq_ring); + return r; } int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev, @@ -519,12 +524,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev) DRM_INFO("kiq ring mec %d pipe %d q %d\n", kiq_ring->me, kiq_ring->pipe, kiq_ring->queue); - + spin_lock(&adev->gfx.kiq.ring_lock); r = amdgpu_ring_alloc(kiq_ring, kiq->pmf->map_queues_size * adev->gfx.num_compute_rings + kiq->pmf->set_resources_size); if (r) { DRM_ERROR("Failed to lock KIQ (%d).\n", r); + spin_unlock(&adev->gfx.kiq.ring_lock); return r; } @@ -533,6 +539,7 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev) kiq->pmf->kiq_map_queues(kiq_ring, &adev->gfx.compute_ring[i]); r = amdgpu_ring_test_helper(kiq_ring); + spin_unlock(&adev->gfx.kiq.ring_lock); if (r) DRM_ERROR("KCQ enable failed\n"); -- cgit From c107171b8d3241d872807c04917e7e8fb70e8b71 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 2 Feb 2021 13:05:49 +0100 Subject: drm/amdgpu: add the sched_score to amdgpu_ring_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow separate ring to share the same scheduler score. No functional change. Signed-off-by: Christian König Reviewed-and-Tested-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index d087f254e0d5..de540c209023 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -310,9 +310,8 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, ring->eop_gpu_addr = kiq->eop_gpu_addr; ring->no_scheduler = true; sprintf(ring->name, "kiq_%d.%d.%d", ring->me, ring->pipe, ring->queue); - r = amdgpu_ring_init(adev, ring, 1024, - irq, AMDGPU_CP_KIQ_IRQ_DRIVER0, - AMDGPU_RING_PRIO_DEFAULT); + r = amdgpu_ring_init(adev, ring, 1024, irq, AMDGPU_CP_KIQ_IRQ_DRIVER0, + AMDGPU_RING_PRIO_DEFAULT, NULL); if (r) dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r); -- cgit From 2d64d23e9596b1815fa1b536b3ac096afac10bd5 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 25 Mar 2021 13:16:48 +0800 Subject: drm/amd/pm: unify the interface for gfx state setting No need to have special handling for swSMU supported ASICs. Signed-off-by: Evan Quan Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index de540c209023..12e8b527776b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -842,14 +842,10 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev) void amdgpu_gfx_state_change_set(struct amdgpu_device *adev, enum gfx_change_state state) { - if (is_support_sw_smu(adev)) { - smu_gfx_state_change_set(&adev->smu, state); - } else { - mutex_lock(&adev->pm.mutex); - if (adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->gfx_state_change_set) - ((adev)->powerplay.pp_funcs->gfx_state_change_set( - (adev)->powerplay.pp_handle, state)); - mutex_unlock(&adev->pm.mutex); - } + mutex_lock(&adev->pm.mutex); + if (adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->gfx_state_change_set) + ((adev)->powerplay.pp_funcs->gfx_state_change_set( + (adev)->powerplay.pp_handle, state)); + mutex_unlock(&adev->pm.mutex); } -- cgit From 719a9b332305b8c4b91805c4bedee27ce82ee916 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 19 Mar 2021 16:59:09 +0800 Subject: drm/amdgpu: split gfx callbacks into ras and non-ras ones gfx ras is only available in cerntain ip generations. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 12e8b527776b..95d4f43a03df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -677,8 +677,9 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, */ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, err_data); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_count) + adev->gfx.ras_funcs->query_ras_error_count(adev, err_data); amdgpu_ras_reset_gpu(adev); } return AMDGPU_RAS_SUCCESS; -- cgit