From 836af5be1b6d8e93d736c252e711a20db7dbde9d Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 30 Jul 2024 11:19:53 +0530 Subject: drm/amdgpu: Clean up the register dump via debugfs list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit debugfs register list for dump is cleaned as it have some issues related to proper power state of the IP before register read. Since the above mentioned is removed we no longer want this to be dumped part of the devcoredump and hence removed. Reviewed-by: Christian König Signed-off-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 137a88b8de45..c54ddd3e68aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -823,17 +823,6 @@ struct amdgpu_mqd { struct amdgpu_reset_domain; struct amdgpu_fru_info; -struct amdgpu_reset_info { - /* reset dump register */ - u32 *reset_dump_reg_list; - u32 *reset_dump_reg_value; - int num_regs; - -#ifdef CONFIG_DEV_COREDUMP - struct amdgpu_coredump_info *coredump_info; -#endif -}; - /* * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise. */ @@ -1157,8 +1146,6 @@ struct amdgpu_device { struct mutex benchmark_mutex; - struct amdgpu_reset_info reset_info; - bool scpm_enabled; uint32_t scpm_status; -- cgit From 7a26f18119d1daf910cca58f875582d50d0e4974 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 12 Aug 2024 10:28:23 +0200 Subject: drm/amdgpu: Do not set struct drm_driver.lastclose Remove the implementation of struct drm_driver.lastclose. The hook was only necessary before in-kernel DRM clients existed, but is now obsolete. The code in amdgpu_driver_lastclose_kms() is performed by drm_lastclose(). v2: - update commit message Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Vetter Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20240812083000.337744-3-tzimmermann@suse.de --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 137a88b8de45..4baeb6519fda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1484,7 +1484,6 @@ extern const int amdgpu_max_kms_ioctl; int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags); void amdgpu_driver_unload_kms(struct drm_device *dev); -void amdgpu_driver_lastclose_kms(struct drm_device *dev); int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv); void amdgpu_driver_postclose_kms(struct drm_device *dev, struct drm_file *file_priv); -- cgit From 35c7152202e111968b10140383f49da9159d2704 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 25 Jul 2024 09:51:56 -0400 Subject: Revert "drm/amdgpu: Extend KIQ reg polling wait for VF" KIQ timeouts no longer seen. This reverts commit 3a19a8af64eaff8a8b230796741a1a8277205344. Signed-off-by: Victor Skvortsov Reviewed-by: Zhigang Luo Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c54ddd3e68aa..f3980b40f2ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -347,9 +347,9 @@ enum amdgpu_kiq_irq { AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0, AMDGPU_CP_KIQ_IRQ_LAST }; -#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */ -#define MAX_KIQ_REG_WAIT (amdgpu_sriov_vf(adev) ? 50000 : 5000) /* in usecs, extend for VF */ -#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */ +#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */ +#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */ +#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */ #define MAX_KIQ_REG_TRY 1000 int amdgpu_device_ip_set_clockgating_state(void *dev, -- cgit From 20588d5afce3992ff4fc9b61085e3e1affbac620 Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Mon, 12 Aug 2024 20:24:15 +0800 Subject: drm/amd: Remove unused declarations amdgpu_gart_table_vram_pin() and amdgpu_gart_table_vram_unpin() has been removed since commit 575e55ee4fbc ("drm/amdgpu: recover gart table at resume") remain the declarations untouched in the header files. Besides, amdgpu_dm_display_resume() has also beed removed since commit a80aa93de1a0 ("drm/amd/display: Unify dm resume sequence into a single call"). So, let's remove this unused declarations. Signed-off-by: Zhang Zekun Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f3980b40f2ce..937de21a7142 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1575,13 +1575,6 @@ static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { } #endif -#if defined(CONFIG_DRM_AMD_DC) -int amdgpu_dm_display_resume(struct amdgpu_device *adev ); -#else -static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return 0; } -#endif - - void amdgpu_register_gpu_instance(struct amdgpu_device *adev); void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev); -- cgit From 96595204195d7e13736a84295e217316610d4cdb Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Mon, 29 Jul 2024 21:35:26 +0530 Subject: drm/amdgpu: Make enforce_isolation setting per GPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes enforce_isolation setting to be per GPU and per partition by adding the enforce_isolation array to the adev structure. The adev variable is set based on the global enforce_isolation module parameter during device initialization. In amdgpu_ids.c, the adev->enforce_isolation value for the current GPU is used to determine whether to enforce isolation between graphics and compute processes on that GPU. In amdgpu_ids.c, the adev->enforce_isolation value for the current GPU and partition is used to determine whether to enforce isolation between graphics and compute processes on that GPU and partition. This allows the enforce_isolation setting to be controlled individually for each GPU and each partition, which is useful in a system with multiple GPUs and partitions where different isolation settings might be desired for different GPUs and partitions. v2: fix loop in amdgpu_vmid_mgr_init() (Alex) Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 937de21a7142..0dceeea235cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1162,6 +1162,8 @@ struct amdgpu_device { bool debug_disable_soft_recovery; bool debug_use_vram_fw_buf; bool debug_enable_ras_aca; + + bool enforce_isolation[MAX_XCP]; }; static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, -- cgit From e189be9b2e3820c88164d95090f1fd6343cd77fc Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Mon, 27 May 2024 07:30:47 +0530 Subject: drm/amdgpu: Add enforce_isolation sysfs attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a new sysfs attribute 'enforce_isolation' to control the 'enforce_isolation' setting per GPU. The attribute can be read and written, and accepts values 0 (disabled) and 1 (enabled). When 'enforce_isolation' is enabled, reserved VMIDs are allocated for each ring. When it's disabled, the reserved VMIDs are freed. The set function locks a mutex before changing the 'enforce_isolation' flag and the VMIDs, and unlocks it afterwards. This ensures that these operations are atomic and prevents race conditions and other concurrency issues. Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 0dceeea235cf..aa97bbefe934 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1164,6 +1164,8 @@ struct amdgpu_device { bool debug_enable_ras_aca; bool enforce_isolation[MAX_XCP]; + /* Added this mutex for cleaner shader isolation between GFX and compute processes */ + struct mutex enforce_isolation_mutex; }; static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, -- cgit From afefd6f245024684fff75100052065d6a9e8f75f Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 6 Jun 2024 13:28:02 +0530 Subject: drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces the Enforce Isolation Handler designed to enforce shader isolation on AMD GPUs, which helps to prevent data leakage between different processes. The handler counts the number of emitted fences for each GFX and compute ring. If there are any fences, it schedules the `enforce_isolation_work` to be run after a delay of `GFX_SLICE_PERIOD`. If there are no fences, it signals the Kernel Fusion Driver (KFD) to resume the runqueue. The function is synchronized using the `enforce_isolation_mutex`. This commit also introduces a reference count mechanism (kfd_sch_req_count) to keep track of the number of requests to enable the KFD scheduler. When a request to enable the KFD scheduler is made, the reference count is decremented. When the reference count reaches zero, a delayed work is scheduled to enforce isolation after a delay of GFX_SLICE_PERIOD. When a request to disable the KFD scheduler is made, the function first checks if the reference count is zero. If it is, it cancels the delayed work for enforcing isolation and checks if the KFD scheduler is active. If the KFD scheduler is active, it sends a request to stop the KFD scheduler and sets the KFD scheduler state to inactive. Then, it increments the reference count. The function is synchronized using the kfd_sch_mutex to ensure that the KFD scheduler state and reference count are updated atomically. Cc: Christian König Cc: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Christian König Suggested-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index aa97bbefe934..e8c284aea1f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -118,6 +118,8 @@ #define MAX_GPU_INSTANCE 64 +#define GFX_SLICE_PERIOD msecs_to_jiffies(250) + struct amdgpu_gpu_instance { struct amdgpu_device *adev; int mgpu_fan_enabled; -- cgit From a9b67c036c7f5d187fb88eb74fe04dff1098700f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 Aug 2024 15:19:04 -0400 Subject: drm/amdgpu: add experimental resets debug flag Add this flag to enable experimental resets for testing before they are fully validated. Reviewed-and-tested-by: Jiadong Zhu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7fe41a3c2541..e095572458cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1164,6 +1164,7 @@ struct amdgpu_device { bool debug_disable_soft_recovery; bool debug_use_vram_fw_buf; bool debug_enable_ras_aca; + bool debug_exp_resets; bool enforce_isolation[MAX_XCP]; /* Added this mutex for cleaner shader isolation between GFX and compute processes */ -- cgit From 01be2b62c0f3c66832472ed3e48e61d631094606 Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Tue, 20 Aug 2024 16:05:30 -0500 Subject: drm/amdgpu: Surface svm_default_granularity, a RW module parameter Enables users to update SVM's default granularity, used in buffer migration and handling of recoverable page faults. Param value is set in terms of log(numPages(buffer)), e.g. 9 for a 2 MIB buffer Signed-off-by: Ramesh Errabolu Reviewed-by: Philip Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e095572458cd..dcd59040c449 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -237,6 +237,7 @@ extern int sched_policy; extern bool debug_evictions; extern bool no_system_mem_limit; extern int halt_if_hws_hang; +extern uint amdgpu_svm_default_granularity; #else static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS; static const bool __maybe_unused debug_evictions; /* = false */ -- cgit From 7181faaa4703705939580abffaf9cb5d6b50dbb7 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 27 Aug 2024 16:12:11 +0200 Subject: drm/amdgpu: nuke the VM PD/PT shadow handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was only used as workaround for recovering the page tables after VRAM was lost and is no longer necessary after the function amdgpu_vm_bo_reset_state_machine() started to do the same. Compute never used shadows either, so the only proplematic case left is SVM and that is most likely not recoverable in any way when VRAM is lost. Signed-off-by: Christian König Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dcd59040c449..9b1e0ede05a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1083,10 +1083,6 @@ struct amdgpu_device { struct amdgpu_virt virt; - /* link all shadow bo */ - struct list_head shadow_list; - struct mutex shadow_list_lock; - /* record hw reset is performed */ bool has_hw_reset; u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; -- cgit