From f87f686482c6d2d4465245356854710b01f312c1 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 9 May 2022 22:22:20 -0400 Subject: drm/amdgpu: Add XCC inst to PASID TLB flushing Add XCC instance to select the correct KIQ ring when flushing TLBs on a multi-XCC setup. Signed-off-by: Mukul Joshi Tested-by: Amber Lin Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 01ba3589b60a..df07e212c21e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -160,7 +160,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev); int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev, uint16_t vmid); int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev, - uint16_t pasid, enum TLB_FLUSH_TYPE flush_type); + uint16_t pasid, enum TLB_FLUSH_TYPE flush_type, + uint32_t inst); bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); -- cgit From 0c7315e7d5ef9b36ca4db32ffeb34a187cbaf231 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 10 Jun 2022 09:41:29 -0400 Subject: drm/amdkfd: Add device repartition support GFX9.4.3 will support dynamic repartitioning of the GPU through sysfs. Add device repartitioning support in KFD to repartition GPU from one mode to other. v2: squash in fix ("drm/amdkfd: Fix warning kgd2kfd_unlock_kfd defined but not used") Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index df07e212c21e..d1d643a050a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -151,6 +151,8 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev); +int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev); +void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev); int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev, enum kgd_engine_type engine, uint32_t vmid, uint64_t gpu_addr, @@ -373,6 +375,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask); +int kgd2kfd_check_and_lock_kfd(void); +void kgd2kfd_unlock_kfd(void); #else static inline int kgd2kfd_init(void) { @@ -438,5 +442,14 @@ static inline void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) { } + +static inline int kgd2kfd_check_and_lock_kfd(void) +{ + return 0; +} + +static inline void kgd2kfd_unlock_kfd(void) +{ +} #endif #endif /* AMDGPU_AMDKFD_H_INCLUDED */ -- cgit From 610dab118ff5013d46069c828b58d576e0907b66 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 31 Mar 2023 11:13:40 -0400 Subject: drm/amdkfd: Move pgmap to amdgpu_kfd_dev structure VRAM pgmap resource is allocated every time when switching compute partitions because kfd_dev is re-initialized by post_partition_switch, As a result, it causes memory region resource leaking and system memory usage accounting unbalanced. pgmap resource should be allocated and registered only once when loading driver and freed when unloading driver, move it from kfd_dev to amdgpu_kfd_dev. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index d1d643a050a1..e4e1dbba060a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include "amdgpu_sync.h" @@ -101,6 +102,9 @@ struct amdgpu_kfd_dev { uint64_t vram_used_aligned; bool init_complete; struct work_struct reset_work; + + /* HMM page migration MEMORY_DEVICE_PRIVATE mapping */ + struct dev_pagemap pgmap; }; enum kgd_engine_type { -- cgit From 4c6ce75fdd628c43aea11448ed41b52119dae42b Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Thu, 26 Jan 2023 18:11:29 -0500 Subject: drm/amdkfd: Show KFD node memory partition info Show KFD node memory partition id and size, add helper function KFD_XCP_MEMORY_SIZE to get kfd node memory size, will be used later to support memory accounting per partition. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e4e1dbba060a..324cb566ca2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -330,6 +330,11 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag); +#define KFD_XCP_MEMORY_SIZE(n) ((n)->adev->gmc.num_mem_partitions ?\ + (n)->adev->gmc.mem_partitions[(n)->xcp->mem_id].size /\ + (n)->adev->xcp_mgr->num_xcp_per_mem_partition :\ + (n)->adev->gmc.real_vram_size) + #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, -- cgit From 3ebfd221c1a83e5f0edadb87d173d8fd93d1d125 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 8 Mar 2023 11:57:00 -0500 Subject: drm/amdkfd: Store xcp partition id to amdgpu bo For memory accounting per compute partition and export drm amdgpu bo and then import to KFD, we need the xcp id to account the memory usage or find the KFD node of the original amdgpu bo to create the KFD bo on the correct adev KFD node. Set xcp_id_plus1 of amdgpu_bo_param to create bo and store xcp_id to amddgpu bo. Add helper macro to get the mem_id from adev and xcp_id. v2: squash in fix ("drm/amdgpu: Fix BO creation failure on GFX 9.4.3 dGPU") Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 324cb566ca2f..05c54776951b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -330,6 +330,10 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag); +#define KFD_XCP_MEM_ID(adev, xcp_id) \ + ((adev)->xcp_mgr && (xcp_id) >= 0 ?\ + (adev)->xcp_mgr->xcp[(xcp_id)].mem_id : -1) + #define KFD_XCP_MEMORY_SIZE(n) ((n)->adev->gmc.num_mem_partitions ?\ (n)->adev->gmc.mem_partitions[(n)->xcp->mem_id].size /\ (n)->adev->xcp_mgr->num_xcp_per_mem_partition :\ -- cgit From 2fa9ff25de08e598af051c76b216d2f073b2ee89 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Thu, 9 Mar 2023 19:30:02 -0500 Subject: drm/amdgpu: KFD graphics interop support compute partition kfd_ioctl_get_dmabuf use the amdgpu bo xcp_id to get the gpu_id of the KFD node from the exported dmabuf_adev, and then create kfd bo on the correct adev and KFD node when importing the amdgpu bo to KFD. Remove function kfd_device_by_adev, it is not needed as it is the same result as dmabuf_adev->kfd.dev->nodes[0]->id. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 05c54776951b..4e6221bccffe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -241,7 +241,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd, struct amdgpu_device **dmabuf_adev, uint64_t *bo_size, void *metadata_buffer, size_t buffer_size, uint32_t *metadata_size, - uint32_t *flags); + uint32_t *flags, int8_t *xcp_id); uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst, struct amdgpu_device *src); int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst, -- cgit From 315e29eca57f85107cc6f687c2d510aa532fb3f0 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 20 Mar 2023 11:21:38 -0400 Subject: drm/amdkfd: Move local_mem_info to kfd_node We need to track memory usage on a per partition basis. To do that, store the local memory information in KFD node instead of kfd device. v2: squash in fix ("amdkfd: Use mem_id to access mem_partition info") Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 4e6221bccffe..4bf6f5659568 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -231,7 +231,8 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem); uint32_t amdgpu_amdkfd_get_fw_version(struct amdgpu_device *adev, enum kgd_engine_type type); void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev, - struct kfd_local_mem_info *mem_info); + struct kfd_local_mem_info *mem_info, + uint8_t xcp_id); uint64_t amdgpu_amdkfd_get_gpu_clock_counter(struct amdgpu_device *adev); uint32_t amdgpu_amdkfd_get_max_engine_clock_in_mhz(struct amdgpu_device *adev); @@ -334,10 +335,11 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, ((adev)->xcp_mgr && (xcp_id) >= 0 ?\ (adev)->xcp_mgr->xcp[(xcp_id)].mem_id : -1) -#define KFD_XCP_MEMORY_SIZE(n) ((n)->adev->gmc.num_mem_partitions ?\ - (n)->adev->gmc.mem_partitions[(n)->xcp->mem_id].size /\ - (n)->adev->xcp_mgr->num_xcp_per_mem_partition :\ - (n)->adev->gmc.real_vram_size) +#define KFD_XCP_MEMORY_SIZE(adev, xcp_id)\ + ((adev)->gmc.num_mem_partitions && (xcp_id) >= 0 ?\ + (adev)->gmc.mem_partitions[KFD_XCP_MEM_ID((adev), (xcp_id))].size /\ + (adev)->xcp_mgr->num_xcp_per_mem_partition :\ + (adev)->gmc.real_vram_size) #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); -- cgit From 1c77527a69d5ca19cb276e2728992d922b687f35 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 20 Mar 2023 11:22:30 -0400 Subject: drm/amdkfd: Fix memory reporting on GFX 9.4.3 This patch fixes memory reporting on the GFX 9.4.3 APU and dGPU by reporting available memory on a per partition basis. If its an APU, available and used memory calculations take into account system and TTM memory. v2: squash in fix ("drm/amdkfd: Fix array out of bound warning") squash in fix ("drm/amdgpu: Update memory reporting for GFX9.4.3") Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 4bf6f5659568..948d362adabb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -35,6 +35,7 @@ #include #include "amdgpu_sync.h" #include "amdgpu_vm.h" +#include "amdgpu_xcp.h" extern uint64_t amdgpu_amdkfd_total_mem_size; @@ -98,8 +99,8 @@ struct amdgpu_amdkfd_fence { struct amdgpu_kfd_dev { struct kfd_dev *dev; - int64_t vram_used; - uint64_t vram_used_aligned; + int64_t vram_used[MAX_XCP]; + uint64_t vram_used_aligned[MAX_XCP]; bool init_complete; struct work_struct reset_work; @@ -287,7 +288,8 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev, void *drm_priv); uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv); -size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev); +size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, + uint8_t xcp_id); int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( struct amdgpu_device *adev, uint64_t va, uint64_t size, void *drm_priv, struct kgd_mem **mem, @@ -327,9 +329,9 @@ void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, - uint64_t size, u32 alloc_flag); + uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, - uint64_t size, u32 alloc_flag); + uint64_t size, u32 alloc_flag, int8_t xcp_id); #define KFD_XCP_MEM_ID(adev, xcp_id) \ ((adev)->xcp_mgr && (xcp_id) >= 0 ?\ -- cgit From 84b4dd3f84de424a68e1fda0d483530ddaa92b45 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 31 Mar 2023 11:18:12 -0400 Subject: drm/amdkfd: Refactor migrate init to support partition switch Rename smv_migrate_init to a better name kgd2kfd_init_zone_device because it setup zone devive pgmap for page migration and keep it in kfd_migrate.c to access static functions svm_migrate_pgmap_ops. Call it only once in amdgpu_device_ip_init after adev ip blocks are initialized, but before amdgpu_amdkfd_device_init initialize kfd nodes which enable SVM support based on pgmap. svm_range_set_max_pages is called by kgd2kfd_device_init everytime after switching compute partition mode. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 948d362adabb..48d12dbff968 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -372,6 +372,17 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo) { } #endif + +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) +int kgd2kfd_init_zone_device(struct amdgpu_device *adev); +#else +static inline +int kgd2kfd_init_zone_device(struct amdgpu_device *adev) +{ + return 0; +} +#endif + /* KGD2KFD callbacks */ int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger); int kgd2kfd_resume_mm(struct mm_struct *mm); -- cgit From 9a3ce1a7a9e5372d8c275bf3fbef4456c8407145 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 12 May 2023 13:22:57 +0800 Subject: drm/amdgpu: Do not access members of xcp w/o check (v2) Not all the asic needs xcp. ensure check xcp availabity before accessing its member. v2: add missing change in kfd_topology.c Signed-off-by: Hawking Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 48d12dbff968..be43d71ba7ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -233,7 +233,7 @@ uint32_t amdgpu_amdkfd_get_fw_version(struct amdgpu_device *adev, enum kgd_engine_type type); void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev, struct kfd_local_mem_info *mem_info, - uint8_t xcp_id); + struct amdgpu_xcp *xcp); uint64_t amdgpu_amdkfd_get_gpu_clock_counter(struct amdgpu_device *adev); uint32_t amdgpu_amdkfd_get_max_engine_clock_in_mhz(struct amdgpu_device *adev); -- cgit From 45b3a914d40e63d2c9e3a3e02fb2014be975b9b0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 16 May 2023 17:16:30 -0400 Subject: drm/amdgpu/gmc9: fix 64 bit division in partition code Rework logic or use do_div() to avoid problems on 32 bit. v2: add a missing case for XCP macro v3: fix out of bounds array access v4: fix xcp handling harder Acked-by: Guchun Chen (v1) Reviewed-by: Mukul Joshi (v3) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index be43d71ba7ef..94cc456761e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -333,15 +333,14 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); +u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); + #define KFD_XCP_MEM_ID(adev, xcp_id) \ ((adev)->xcp_mgr && (xcp_id) >= 0 ?\ (adev)->xcp_mgr->xcp[(xcp_id)].mem_id : -1) -#define KFD_XCP_MEMORY_SIZE(adev, xcp_id)\ - ((adev)->gmc.num_mem_partitions && (xcp_id) >= 0 ?\ - (adev)->gmc.mem_partitions[KFD_XCP_MEM_ID((adev), (xcp_id))].size /\ - (adev)->xcp_mgr->num_xcp_per_mem_partition :\ - (adev)->gmc.real_vram_size) +#define KFD_XCP_MEMORY_SIZE(adev, xcp_id) amdgpu_amdkfd_xcp_memory_size((adev), (xcp_id)) + #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); -- cgit From 12fb1ad70d65edc3405884792d044fa79df7244f Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Fri, 22 Apr 2022 12:26:18 -0400 Subject: drm/amdkfd: update process interrupt handling for debug events The debugger must be notified by any debugger subscribed exception that comes from hardware interrupts. If a debugger session exits, any exceptions it subscribed to may still have interrupts in the interrupt ring buffer or KGD/KFD pipeline. To prevent a new session from inheriting stale interrupts, when a new queue is created, open an interrupt drain and allow the IH ring to drain from a timestamped checkpoint. Then inject a custom IV so that once the custom IV is picked up by the KFD, it's safe to close the drain and proceed with queue creation. The drain must also be on debug disable as SW interrupts may still be processed. Drain at this time and clear all the exception status. The debugger may also not be attached nor subscibed to certain exceptions so forward them directly to the runtime. GFX10 also requires its own IV processing, hence the creation of kfd_int_process_v10.c. This is because the IV from SQ interrupts are packed into a new continguous format unlike GFX9. To make this clear, a separate interrupting handling code file was created. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 94cc456761e5..dd740e64e6e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -250,6 +250,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst, struct amdgpu_device *src, bool is_min); int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min); +int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, + uint32_t *payload); /* Read user wptr from a specified user address space with page fault * disabled. The memory must be pinned and mapped to the hardware when -- cgit From a70a93fa568b4f05aba548dadb673703eccf5480 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Thu, 5 May 2022 16:15:37 -0400 Subject: drm/amdkfd: add debug suspend and resume process queues operation In order to inspect waves from the saved context at any point during a debug session, the debugger must be able to preempt queues to trigger context save by suspending them. On queue suspend, the KFD will copy the context save header information so that the debugger can correctly crawl the appropriate size of the saved context. The debugger must then also be allowed to resume suspended queues. A queue that is newly created cannot be suspended because queue ids are recycled after destruction so the debugger needs to know that this has occurred. Query functions will be later added that will clear a given queue of its new queue status. A queue cannot be destroyed while it is suspended to preserve its saved context during debugger inspection. Have queue destruction block while a queue is suspended and unblocked when it is resumed. Likewise, if a queue is about to be destroyed, it cannot be suspended. Return the number of queues successfully suspended or resumed along with a per queue status array where the upper bits per queue status show that the request was invalid (new/destroyed queue suspend request, missing queue) or an error occurred (HWS in a fatal state so it can't suspend or resume queues). Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index dd740e64e6e1..2d0406bff84e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, uint64_t *mmap_offset); int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, struct dma_buf **dmabuf); +void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, -- cgit