drm/amdkfd: Fix circular locking dependency warning

[ 150.887733] ====================================================== [ 150.893903] WARNING: possible circular locking dependency detected [ 150.905917] ------------------------------------------------------ [ 150.912129] kfdtest/4081 is trying to acquire lock: [ 150.917002] ffff8f7f3762e118 (&mm->mmap_sem#2){++++}, at: __might_fault+0x3e/0x90 [ 150.924490] but task is already holding lock: [ 150.930320] ffff8f7f49d229e8 (&dqm->lock_hidden){+.+.}, at: destroy_queue_cpsch+0x29/0x210 [amdgpu] [ 150.939432] which lock already depends on the new lock. [ 150.947603] the existing dependency chain (in reverse order) is: [ 150.955074] -> #3 (&dqm->lock_hidden){+.+.}: [ 150.960822] __mutex_lock+0xa1/0x9f0 [ 150.964996] evict_process_queues_cpsch+0x22/0x120 [amdgpu] [ 150.971155] kfd_process_evict_queues+0x3b/0xc0 [amdgpu] [ 150.977054] kgd2kfd_quiesce_mm+0x25/0x60 [amdgpu] [ 150.982442] amdgpu_amdkfd_evict_userptr+0x35/0x70 [amdgpu] [ 150.988615] amdgpu_mn_invalidate_hsa+0x41/0x60 [amdgpu] [ 150.994448] __mmu_notifier_invalidate_range_start+0xa4/0x240 [ 151.000714] copy_page_range+0xd70/0xd80 [ 151.005159] dup_mm+0x3ca/0x550 [ 151.008816] copy_process+0x1bdc/0x1c70 [ 151.013183] _do_fork+0x76/0x6c0 [ 151.016929] __x64_sys_clone+0x8c/0xb0 [ 151.021201] do_syscall_64+0x4a/0x1d0 [ 151.025404] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 151.030977] -> #2 (&adev->notifier_lock){+.+.}: [ 151.036993] __mutex_lock+0xa1/0x9f0 [ 151.041168] amdgpu_mn_invalidate_hsa+0x30/0x60 [amdgpu] [ 151.047019] __mmu_notifier_invalidate_range_start+0xa4/0x240 [ 151.053277] copy_page_range+0xd70/0xd80 [ 151.057722] dup_mm+0x3ca/0x550 [ 151.061388] copy_process+0x1bdc/0x1c70 [ 151.065748] _do_fork+0x76/0x6c0 [ 151.069499] __x64_sys_clone+0x8c/0xb0 [ 151.073765] do_syscall_64+0x4a/0x1d0 [ 151.077952] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 151.083523] -> #1 (mmu_notifier_invalidate_range_start){+.+.}: [ 151.090833] change_protection+0x802/0xab0 [ 151.095448] mprotect_fixup+0x187/0x2d0 [ 151.099801] setup_arg_pages+0x124/0x250 [ 151.104251] load_elf_binary+0x3a4/0x1464 [ 151.108781] search_binary_handler+0x6c/0x210 [ 151.113656] __do_execve_file.isra.40+0x7f7/0xa50 [ 151.118875] do_execve+0x21/0x30 [ 151.122632] call_usermodehelper_exec_async+0x17e/0x190 [ 151.128393] ret_from_fork+0x24/0x30 [ 151.132489] -> #0 (&mm->mmap_sem#2){++++}: [ 151.138064] __lock_acquire+0x11a1/0x1490 [ 151.142597] lock_acquire+0x90/0x180 [ 151.146694] __might_fault+0x68/0x90 [ 151.150879] read_sdma_queue_counter+0x5f/0xb0 [amdgpu] [ 151.156693] update_sdma_queue_past_activity_stats+0x3b/0x90 [amdgpu] [ 151.163725] destroy_queue_cpsch+0x1ae/0x210 [amdgpu] [ 151.169373] pqm_destroy_queue+0xf0/0x250 [amdgpu] [ 151.174762] kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu] [ 151.180577] kfd_ioctl+0x223/0x400 [amdgpu] [ 151.185284] ksys_ioctl+0x8f/0xb0 [ 151.189118] __x64_sys_ioctl+0x16/0x20 [ 151.193389] do_syscall_64+0x4a/0x1d0 [ 151.197569] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 151.203141] other info that might help us debug this: [ 151.211140] Chain exists of: &mm->mmap_sem#2 --> &adev->notifier_lock --> &dqm->lock_hidden [ 151.222535] Possible unsafe locking scenario: [ 151.228447] CPU0 CPU1 [ 151.232971] ---- ---- [ 151.237502] lock(&dqm->lock_hidden); [ 151.241254] lock(&adev->notifier_lock); [ 151.247774] lock(&dqm->lock_hidden); [ 151.254038] lock(&mm->mmap_sem#2); This commit fixes the warning by ensuring get_user() is not called while reading SDMA stats with dqm_lock held as get_user() could cause a page fault which leads to the circular locking scenario. Signed-off-by: Mukul Joshi <mukul.joshi@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Mukul Joshi <mukul.joshi@amd.com> 2020-06-24 13:47:12 -0400
committer: Alex Deucher <alexander.deucher@amd.com> 2020-07-01 01:59:27 -0400
commit: d69fd951e60ae48c8dd100aa8ceb799ab965f9b3 (patch)
tree: e9a5d66a0ef9da31bc686693d5b2eaf31a41b3b1 /drivers/gpu/drm/amd/amdkfd/kfd_process.c
parent: 7ee78aff9de13d5dccba133f4a0de5367194b243 (diff)
download: linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.gz
linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.bz2
linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.zip
1 files changed, 123 insertions, 20 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d0409df35032..013c2b018edc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -86,6 +86,13 @@ struct kfd_sdma_activity_handler_workarea {
 	uint64_t sdma_activity_counter;
 };
 
+struct temp_sdma_queue_list {
+	uint64_t rptr;
+	uint64_t sdma_val;
+	unsigned int queue_id;
+	struct list_head list;
+};
+
 static void kfd_sdma_activity_worker(struct work_struct *work)
 {
 	struct kfd_sdma_activity_handler_workarea *workarea;
@@ -96,6 +103,8 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 	struct qcm_process_device *qpd;
 	struct device_queue_manager *dqm;
 	int ret = 0;
+	struct temp_sdma_queue_list sdma_q_list;
+	struct temp_sdma_queue_list *sdma_q, *next;
 
 	workarea = container_of(work, struct kfd_sdma_activity_handler_workarea,
 				sdma_activity_work);
@@ -109,41 +118,135 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 	qpd = &pdd->qpd;
 	if (!dqm || !qpd)
 		return;
+	/*
+	 * Total SDMA activity is current SDMA activity + past SDMA activity
+	 * Past SDMA count is stored in pdd.
+	 * To get the current activity counters for all active SDMA queues,
+	 * we loop over all SDMA queues and get their counts from user-space.
+	 *
+	 * We cannot call get_user() with dqm_lock held as it can cause
+	 * a circular lock dependency situation. To read the SDMA stats,
+	 * we need to do the following:
+	 *
+	 * 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list,
+	 *    with dqm_lock/dqm_unlock().
+	 * 2. Call get_user() for each node in temporary list without dqm_lock.
+	 *    Save the SDMA count for each node and also add the count to the total
+	 *    SDMA count counter.
+	 *    Its possible, during this step, a few SDMA queue nodes got deleted
+	 *    from the qpd->queues_list.
+	 * 3. Do a second pass over qpd->queues_list to check if any nodes got deleted.
+	 *    If any node got deleted, its SDMA count would be captured in the sdma
+	 *    past activity counter. So subtract the SDMA counter stored in step 2
+	 *    for this node from the total SDMA count.
+	 */
+	INIT_LIST_HEAD(&sdma_q_list.list);
 
-	mm = get_task_mm(pdd->process->lead_thread);
-	if (!mm) {
-		return;
+	/*
+	 * Create the temp list of all SDMA queues
+	 */
+	dqm_lock(dqm);
+
+	list_for_each_entry(q, &qpd->queues_list, list) {
+		if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
+		    (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
+			continue;
+
+		sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL);
+		if (!sdma_q) {
+			dqm_unlock(dqm);
+			goto cleanup;
+		}
+
+		INIT_LIST_HEAD(&sdma_q->list);
+		sdma_q->rptr = (uint64_t)q->properties.read_ptr;
+		sdma_q->queue_id = q->properties.queue_id;
+		list_add_tail(&sdma_q->list, &sdma_q_list.list);
 	}
 
-	use_mm(mm);
+	/*
+	 * If the temp list is empty, then no SDMA queues nodes were found in
+	 * qpd->queues_list. Return the past activity count as the total sdma
+	 * count
+	 */
+	if (list_empty(&sdma_q_list.list)) {
+		workarea->sdma_activity_counter = pdd->sdma_past_activity_counter;
+		dqm_unlock(dqm);
+		return;
+	}
 
-	dqm_lock(dqm);
+	dqm_unlock(dqm);
 
 	/*
-	 * Total SDMA activity is current SDMA activity + past SDMA activity
+	 * Get the usage count for each SDMA queue in temp_list.
 	 */
-	workarea->sdma_activity_counter = pdd->sdma_past_activity_counter;
+	mm = get_task_mm(pdd->process->lead_thread);
+	if (!mm)
+		goto cleanup;
+
+	use_mm(mm);
+
+	list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
+		val = 0;
+		ret = read_sdma_queue_counter(sdma_q->rptr, &val);
+		if (ret) {
+			pr_debug("Failed to read SDMA queue active counter for queue id: %d",
+				 sdma_q->queue_id);
+		} else {
+			sdma_q->sdma_val = val;
+			workarea->sdma_activity_counter += val;
+		}
+	}
+
+	unuse_mm(mm);
+	mmput(mm);
 
 	/*
-	 * Get the current activity counters for all active SDMA queues
+	 * Do a second iteration over qpd_queues_list to check if any SDMA
+	 * nodes got deleted while fetching SDMA counter.
 	 */
+	dqm_lock(dqm);
+
+	workarea->sdma_activity_counter += pdd->sdma_past_activity_counter;
+
 	list_for_each_entry(q, &qpd->queues_list, list) {
-		if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
-		    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
-			val = 0;
-			ret = read_sdma_queue_counter(q, &val);
-			if (ret)
-				pr_debug("Failed to read SDMA queue active "
-					 "counter for queue id: %d",
-					 q->properties.queue_id);
-			else
-				workarea->sdma_activity_counter += val;
+		if (list_empty(&sdma_q_list.list))
+			break;
+
+		if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
+		    (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
+			continue;
+
+		list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
+			if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) &&
+			     (sdma_q->queue_id == q->properties.queue_id)) {
+				list_del(&sdma_q->list);
+				kfree(sdma_q);
+				break;
+			}
 		}
 	}
 
 	dqm_unlock(dqm);
-	unuse_mm(mm);
-	mmput(mm);
+
+	/*
+	 * If temp list is not empty, it implies some queues got deleted
+	 * from qpd->queues_list during SDMA usage read. Subtract the SDMA
+	 * count for each node from the total SDMA count.
+	 */
+	list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
+		workarea->sdma_activity_counter -= sdma_q->sdma_val;
+		list_del(&sdma_q->list);
+		kfree(sdma_q);
+	}
+
+	return;
+
+cleanup:
+	list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
+		list_del(&sdma_q->list);
+		kfree(sdma_q);
+	}
 }
 
 static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
author	Mukul Joshi <mukul.joshi@amd.com>	2020-06-24 13:47:12 -0400
committer	Alex Deucher <alexander.deucher@amd.com>	2020-07-01 01:59:27 -0400
commit	d69fd951e60ae48c8dd100aa8ceb799ab965f9b3 (patch)
tree	e9a5d66a0ef9da31bc686693d5b2eaf31a41b3b1 /drivers/gpu/drm/amd/amdkfd/kfd_process.c
parent	7ee78aff9de13d5dccba133f4a0de5367194b243 (diff)
download	linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.gz linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.bz2 linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.zip