drm/amdgpu: fix userptr HMM range handling v2

The basic problem here is that it's not allowed to page fault while holding the reservation lock.

So it can happen that multiple processes try to validate a userptr at the same time.

Work around that by putting the HMM range object into the mutex-protected bo list for now.

v2: make sure range is set to NULL in case of an error

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
CC: stable@vger.kernel.org
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Christian König <christian.koenig@amd.com> 2022-11-10 12:31:41 +0100
committer: Alex Deucher <alexander.deucher@amd.com> 2022-11-17 00:23:30 -0500
commit: fec8fdb54e8f74d88951c9f998f47bf4f2031fe0 (patch)
tree: 6488c4cbf8dedf902da012af243f8156d34a7be2 /drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
parent: 631945e04e1e243a503089f4487cad797476e8ca (diff)
download: linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.tar.gz
linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.tar.bz2
linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.zip
1 files changed, 8 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 68741b157153..e44d740022bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -938,6 +938,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
 	struct amdkfd_process_info *process_info = mem->process_info;
 	struct amdgpu_bo *bo = mem->bo;
 	struct ttm_operation_ctx ctx = { true, false };
+	struct hmm_range *range;
 	int ret = 0;
 
 	mutex_lock(&process_info->lock);
@@ -967,7 +968,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
 		return 0;
 	}
 
-	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
 	if (ret) {
 		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
 		goto unregister_out;
@@ -985,7 +986,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
 	amdgpu_bo_unreserve(bo);
 
 release_out:
-	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
 unregister_out:
 	if (ret)
 		amdgpu_mn_unregister(bo);
@@ -2317,6 +2318,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
 	/* Go through userptr_inval_list and update any invalid user_pages */
 	list_for_each_entry(mem, &process_info->userptr_inval_list,
 			    validate_list.head) {
+		struct hmm_range *range;
+
 		invalid = atomic_read(&mem->invalid);
 		if (!invalid)
 			/* BO hasn't been invalidated since the last
@@ -2327,7 +2330,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
 		bo = mem->bo;
 
 		/* Get updated user pages */
-		ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+		ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+						   &range);
 		if (ret) {
 			pr_debug("Failed %d to get user pages\n", ret);
 
@@ -2346,7 +2350,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
 			 * FIXME: Cannot ignore the return code, must hold
 			 * notifier_lock
 			 */
-			amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+			amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
 		}
 
 		/* Mark the BO as valid unless it was invalidated
author	Christian König <christian.koenig@amd.com>	2022-11-10 12:31:41 +0100
committer	Alex Deucher <alexander.deucher@amd.com>	2022-11-17 00:23:30 -0500
commit	fec8fdb54e8f74d88951c9f998f47bf4f2031fe0 (patch)
tree	6488c4cbf8dedf902da012af243f8156d34a7be2 /drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
parent	631945e04e1e243a503089f4487cad797476e8ca (diff)
download	linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.tar.gz linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.tar.bz2 linux-fec8fdb54e8f74d88951c9f998f47bf4f2031fe0.zip