diff options
author | Mukul Joshi <mukul.joshi@amd.com> | 2020-06-24 13:47:12 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2020-07-01 01:59:27 -0400 |
commit | d69fd951e60ae48c8dd100aa8ceb799ab965f9b3 (patch) | |
tree | e9a5d66a0ef9da31bc686693d5b2eaf31a41b3b1 /drivers/gpu/drm/amd/amdkfd/kfd_process.c | |
parent | 7ee78aff9de13d5dccba133f4a0de5367194b243 (diff) | |
download | linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.gz linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.tar.bz2 linux-d69fd951e60ae48c8dd100aa8ceb799ab965f9b3.zip |
drm/amdkfd: Fix circular locking dependency warning
[ 150.887733] ======================================================
[ 150.893903] WARNING: possible circular locking dependency detected
[ 150.905917] ------------------------------------------------------
[ 150.912129] kfdtest/4081 is trying to acquire lock:
[ 150.917002] ffff8f7f3762e118 (&mm->mmap_sem#2){++++}, at:
__might_fault+0x3e/0x90
[ 150.924490]
but task is already holding lock:
[ 150.930320] ffff8f7f49d229e8 (&dqm->lock_hidden){+.+.}, at:
destroy_queue_cpsch+0x29/0x210 [amdgpu]
[ 150.939432]
which lock already depends on the new lock.
[ 150.947603]
the existing dependency chain (in reverse order) is:
[ 150.955074]
-> #3 (&dqm->lock_hidden){+.+.}:
[ 150.960822] __mutex_lock+0xa1/0x9f0
[ 150.964996] evict_process_queues_cpsch+0x22/0x120 [amdgpu]
[ 150.971155] kfd_process_evict_queues+0x3b/0xc0 [amdgpu]
[ 150.977054] kgd2kfd_quiesce_mm+0x25/0x60 [amdgpu]
[ 150.982442] amdgpu_amdkfd_evict_userptr+0x35/0x70 [amdgpu]
[ 150.988615] amdgpu_mn_invalidate_hsa+0x41/0x60 [amdgpu]
[ 150.994448] __mmu_notifier_invalidate_range_start+0xa4/0x240
[ 151.000714] copy_page_range+0xd70/0xd80
[ 151.005159] dup_mm+0x3ca/0x550
[ 151.008816] copy_process+0x1bdc/0x1c70
[ 151.013183] _do_fork+0x76/0x6c0
[ 151.016929] __x64_sys_clone+0x8c/0xb0
[ 151.021201] do_syscall_64+0x4a/0x1d0
[ 151.025404] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.030977]
-> #2 (&adev->notifier_lock){+.+.}:
[ 151.036993] __mutex_lock+0xa1/0x9f0
[ 151.041168] amdgpu_mn_invalidate_hsa+0x30/0x60 [amdgpu]
[ 151.047019] __mmu_notifier_invalidate_range_start+0xa4/0x240
[ 151.053277] copy_page_range+0xd70/0xd80
[ 151.057722] dup_mm+0x3ca/0x550
[ 151.061388] copy_process+0x1bdc/0x1c70
[ 151.065748] _do_fork+0x76/0x6c0
[ 151.069499] __x64_sys_clone+0x8c/0xb0
[ 151.073765] do_syscall_64+0x4a/0x1d0
[ 151.077952] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.083523]
-> #1 (mmu_notifier_invalidate_range_start){+.+.}:
[ 151.090833] change_protection+0x802/0xab0
[ 151.095448] mprotect_fixup+0x187/0x2d0
[ 151.099801] setup_arg_pages+0x124/0x250
[ 151.104251] load_elf_binary+0x3a4/0x1464
[ 151.108781] search_binary_handler+0x6c/0x210
[ 151.113656] __do_execve_file.isra.40+0x7f7/0xa50
[ 151.118875] do_execve+0x21/0x30
[ 151.122632] call_usermodehelper_exec_async+0x17e/0x190
[ 151.128393] ret_from_fork+0x24/0x30
[ 151.132489]
-> #0 (&mm->mmap_sem#2){++++}:
[ 151.138064] __lock_acquire+0x11a1/0x1490
[ 151.142597] lock_acquire+0x90/0x180
[ 151.146694] __might_fault+0x68/0x90
[ 151.150879] read_sdma_queue_counter+0x5f/0xb0 [amdgpu]
[ 151.156693] update_sdma_queue_past_activity_stats+0x3b/0x90 [amdgpu]
[ 151.163725] destroy_queue_cpsch+0x1ae/0x210 [amdgpu]
[ 151.169373] pqm_destroy_queue+0xf0/0x250 [amdgpu]
[ 151.174762] kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu]
[ 151.180577] kfd_ioctl+0x223/0x400 [amdgpu]
[ 151.185284] ksys_ioctl+0x8f/0xb0
[ 151.189118] __x64_sys_ioctl+0x16/0x20
[ 151.193389] do_syscall_64+0x4a/0x1d0
[ 151.197569] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.203141]
other info that might help us debug this:
[ 151.211140] Chain exists of:
&mm->mmap_sem#2 --> &adev->notifier_lock --> &dqm->lock_hidden
[ 151.222535] Possible unsafe locking scenario:
[ 151.228447] CPU0 CPU1
[ 151.232971] ---- ----
[ 151.237502] lock(&dqm->lock_hidden);
[ 151.241254] lock(&adev->notifier_lock);
[ 151.247774] lock(&dqm->lock_hidden);
[ 151.254038] lock(&mm->mmap_sem#2);
This commit fixes the warning by ensuring get_user() is not called
while reading SDMA stats with dqm_lock held as get_user() could cause a
page fault which leads to the circular locking scenario.
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 143 |
1 files changed, 123 insertions, 20 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d0409df35032..013c2b018edc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -86,6 +86,13 @@ struct kfd_sdma_activity_handler_workarea { uint64_t sdma_activity_counter; }; +struct temp_sdma_queue_list { + uint64_t rptr; + uint64_t sdma_val; + unsigned int queue_id; + struct list_head list; +}; + static void kfd_sdma_activity_worker(struct work_struct *work) { struct kfd_sdma_activity_handler_workarea *workarea; @@ -96,6 +103,8 @@ static void kfd_sdma_activity_worker(struct work_struct *work) struct qcm_process_device *qpd; struct device_queue_manager *dqm; int ret = 0; + struct temp_sdma_queue_list sdma_q_list; + struct temp_sdma_queue_list *sdma_q, *next; workarea = container_of(work, struct kfd_sdma_activity_handler_workarea, sdma_activity_work); @@ -109,41 +118,135 @@ static void kfd_sdma_activity_worker(struct work_struct *work) qpd = &pdd->qpd; if (!dqm || !qpd) return; + /* + * Total SDMA activity is current SDMA activity + past SDMA activity + * Past SDMA count is stored in pdd. + * To get the current activity counters for all active SDMA queues, + * we loop over all SDMA queues and get their counts from user-space. + * + * We cannot call get_user() with dqm_lock held as it can cause + * a circular lock dependency situation. To read the SDMA stats, + * we need to do the following: + * + * 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list, + * with dqm_lock/dqm_unlock(). + * 2. Call get_user() for each node in temporary list without dqm_lock. + * Save the SDMA count for each node and also add the count to the total + * SDMA count counter. + * Its possible, during this step, a few SDMA queue nodes got deleted + * from the qpd->queues_list. + * 3. Do a second pass over qpd->queues_list to check if any nodes got deleted. + * If any node got deleted, its SDMA count would be captured in the sdma + * past activity counter. So subtract the SDMA counter stored in step 2 + * for this node from the total SDMA count. + */ + INIT_LIST_HEAD(&sdma_q_list.list); - mm = get_task_mm(pdd->process->lead_thread); - if (!mm) { - return; + /* + * Create the temp list of all SDMA queues + */ + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL); + if (!sdma_q) { + dqm_unlock(dqm); + goto cleanup; + } + + INIT_LIST_HEAD(&sdma_q->list); + sdma_q->rptr = (uint64_t)q->properties.read_ptr; + sdma_q->queue_id = q->properties.queue_id; + list_add_tail(&sdma_q->list, &sdma_q_list.list); } - use_mm(mm); + /* + * If the temp list is empty, then no SDMA queues nodes were found in + * qpd->queues_list. Return the past activity count as the total sdma + * count + */ + if (list_empty(&sdma_q_list.list)) { + workarea->sdma_activity_counter = pdd->sdma_past_activity_counter; + dqm_unlock(dqm); + return; + } - dqm_lock(dqm); + dqm_unlock(dqm); /* - * Total SDMA activity is current SDMA activity + past SDMA activity + * Get the usage count for each SDMA queue in temp_list. */ - workarea->sdma_activity_counter = pdd->sdma_past_activity_counter; + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) + goto cleanup; + + use_mm(mm); + + list_for_each_entry(sdma_q, &sdma_q_list.list, list) { + val = 0; + ret = read_sdma_queue_counter(sdma_q->rptr, &val); + if (ret) { + pr_debug("Failed to read SDMA queue active counter for queue id: %d", + sdma_q->queue_id); + } else { + sdma_q->sdma_val = val; + workarea->sdma_activity_counter += val; + } + } + + unuse_mm(mm); + mmput(mm); /* - * Get the current activity counters for all active SDMA queues + * Do a second iteration over qpd_queues_list to check if any SDMA + * nodes got deleted while fetching SDMA counter. */ + dqm_lock(dqm); + + workarea->sdma_activity_counter += pdd->sdma_past_activity_counter; + list_for_each_entry(q, &qpd->queues_list, list) { - if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || - (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { - val = 0; - ret = read_sdma_queue_counter(q, &val); - if (ret) - pr_debug("Failed to read SDMA queue active " - "counter for queue id: %d", - q->properties.queue_id); - else - workarea->sdma_activity_counter += val; + if (list_empty(&sdma_q_list.list)) + break; + + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) && + (sdma_q->queue_id == q->properties.queue_id)) { + list_del(&sdma_q->list); + kfree(sdma_q); + break; + } } } dqm_unlock(dqm); - unuse_mm(mm); - mmput(mm); + + /* + * If temp list is not empty, it implies some queues got deleted + * from qpd->queues_list during SDMA usage read. Subtract the SDMA + * count for each node from the total SDMA count. + */ + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + workarea->sdma_activity_counter -= sdma_q->sdma_val; + list_del(&sdma_q->list); + kfree(sdma_q); + } + + return; + +cleanup: + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + list_del(&sdma_q->list); + kfree(sdma_q); + } } static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, |