Diffstat (limited to 'drivers/gpu/drm/scheduler')
-rw-r--r--  drivers/gpu/drm/scheduler/sched_entity.c | 52
-rw-r--r--  drivers/gpu/drm/scheduler/sched_main.c   | 86
2 files changed, 92 insertions, 46 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index a75eede8bf8d..69bcf0e99d57 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -51,7 +51,7 @@
  * drm_sched_entity_set_priority(). For changing the set of schedulers
  * @sched_list at runtime see drm_sched_entity_modify_sched().
  *
- * An entity is cleaned up by callind drm_sched_entity_fini(). See also
+ * An entity is cleaned up by calling drm_sched_entity_fini(). See also
  * drm_sched_entity_destroy().
  *
  * Returns 0 on success or a negative error code on failure.
@@ -105,7 +105,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
 	/* We start in an idle state. */
 	complete_all(&entity->entity_idle);
 
-	spin_lock_init(&entity->rq_lock);
+	spin_lock_init(&entity->lock);
 	spsc_queue_init(&entity->job_queue);
 
 	atomic_set(&entity->fence_seq, 0);
@@ -133,10 +133,10 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
 {
 	WARN_ON(!num_sched_list || !sched_list);
 
-	spin_lock(&entity->rq_lock);
+	spin_lock(&entity->lock);
 	entity->sched_list = sched_list;
 	entity->num_sched_list = num_sched_list;
-	spin_unlock(&entity->rq_lock);
+	spin_unlock(&entity->lock);
 }
 EXPORT_SYMBOL(drm_sched_entity_modify_sched);
 
@@ -244,10 +244,10 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
 	if (!entity->rq)
 		return;
 
-	spin_lock(&entity->rq_lock);
+	spin_lock(&entity->lock);
 	entity->stopped = true;
 	drm_sched_rq_remove_entity(entity->rq, entity);
-	spin_unlock(&entity->rq_lock);
+	spin_unlock(&entity->lock);
 
 	/* Make sure this entity is not used by the scheduler at the moment */
 	wait_for_completion(&entity->entity_idle);
@@ -372,8 +372,8 @@ static void drm_sched_entity_clear_dep(struct dma_fence *f,
 }
 
 /*
- * drm_sched_entity_clear_dep - callback to clear the entities dependency and
- * wake up scheduler
+ * drm_sched_entity_wakeup - callback to clear the entity's dependency and
+ * wake up the scheduler
  */
 static void drm_sched_entity_wakeup(struct dma_fence *f,
 				    struct dma_fence_cb *cb)
@@ -391,14 +391,14 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
  * @entity: scheduler entity
  * @priority: scheduler priority
  *
- * Update the priority of runqueus used for the entity.
+ * Update the priority of runqueues used for the entity.
  */
 void drm_sched_entity_set_priority(struct drm_sched_entity *entity,
 				   enum drm_sched_priority priority)
 {
-	spin_lock(&entity->rq_lock);
+	spin_lock(&entity->lock);
 	entity->priority = priority;
-	spin_unlock(&entity->rq_lock);
+	spin_unlock(&entity->lock);
 }
 EXPORT_SYMBOL(drm_sched_entity_set_priority);
 
@@ -514,8 +514,17 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 		struct drm_sched_job *next;
 
 		next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
-		if (next)
-			drm_sched_rq_update_fifo(entity, next->submit_ts);
+		if (next) {
+			struct drm_sched_rq *rq;
+
+			spin_lock(&entity->lock);
+			rq = entity->rq;
+			spin_lock(&rq->lock);
+			drm_sched_rq_update_fifo_locked(entity, rq,
+							next->submit_ts);
+			spin_unlock(&rq->lock);
+			spin_unlock(&entity->lock);
+		}
 	}
 
 	/* Jobs and entities might have different lifecycles. Since we're
@@ -555,14 +564,14 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 	if (fence && !dma_fence_is_signaled(fence))
 		return;
 
-	spin_lock(&entity->rq_lock);
+	spin_lock(&entity->lock);
 	sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
 	rq = sched ? sched->sched_rq[entity->priority] : NULL;
 	if (rq != entity->rq) {
 		drm_sched_rq_remove_entity(entity->rq, entity);
 		entity->rq = rq;
 	}
-	spin_unlock(&entity->rq_lock);
+	spin_unlock(&entity->lock);
 
 	if (entity->num_sched_list == 1)
 		entity->sched_list = NULL;
@@ -576,8 +585,6 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
  * fence sequence number this function should be called with drm_sched_job_arm()
  * under common lock for the struct drm_sched_entity that was set up for
  * @sched_job in drm_sched_job_init().
- *
- * Returns 0 for success, negative error code otherwise.
  */
 void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 {
@@ -603,9 +610,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 		struct drm_sched_rq *rq;
 
 		/* Add the entity to the run queue */
-		spin_lock(&entity->rq_lock);
+		spin_lock(&entity->lock);
 		if (entity->stopped) {
-			spin_unlock(&entity->rq_lock);
+			spin_unlock(&entity->lock);
 
 			DRM_ERROR("Trying to push to a killed entity\n");
 			return;
@@ -614,11 +621,14 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 		rq = entity->rq;
 		sched = rq->sched;
 
+		spin_lock(&rq->lock);
 		drm_sched_rq_add_entity(rq, entity);
-		spin_unlock(&entity->rq_lock);
 
 		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-			drm_sched_rq_update_fifo(entity, submit_ts);
+			drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
+
+		spin_unlock(&rq->lock);
+		spin_unlock(&entity->lock);
 
 		drm_sched_wakeup(sched);
 	}
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index e97c6c60bc96..7ce25281c74c 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -41,7 +41,7 @@
  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
  *    the hardware.
  *
- * The jobs in a entity are always scheduled in the order that they were pushed.
+ * The jobs in an entity are always scheduled in the order in which they were pushed.
 *
 * Note that once a job was taken from the entities queue and pushed to the
 * hardware, i.e. the pending queue, the entity must not be referenced anymore
@@ -159,35 +159,33 @@ static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
 	return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting);
 }
 
-static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
+					    struct drm_sched_rq *rq)
 {
-	struct drm_sched_rq *rq = entity->rq;
-
 	if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
 		rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
 		RB_CLEAR_NODE(&entity->rb_tree_node);
 	}
 }
 
-void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+				     struct drm_sched_rq *rq,
+				     ktime_t ts)
 {
 	/*
 	 * Both locks need to be grabbed, one to protect from entity->rq change
 	 * for entity from within concurrent drm_sched_entity_select_rq and the
 	 * other to update the rb tree structure.
 	 */
-	spin_lock(&entity->rq_lock);
-	spin_lock(&entity->rq->lock);
+	lockdep_assert_held(&entity->lock);
+	lockdep_assert_held(&rq->lock);
 
-	drm_sched_rq_remove_fifo_locked(entity);
+	drm_sched_rq_remove_fifo_locked(entity, rq);
 
 	entity->oldest_job_waiting = ts;
 
-	rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
+	rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
 		      drm_sched_entity_compare_before);
-
-	spin_unlock(&entity->rq->lock);
-	spin_unlock(&entity->rq_lock);
 }
 
 /**
@@ -219,15 +217,14 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 			     struct drm_sched_entity *entity)
 {
+	lockdep_assert_held(&entity->lock);
+	lockdep_assert_held(&rq->lock);
+
 	if (!list_empty(&entity->list))
 		return;
 
-	spin_lock(&rq->lock);
-
 	atomic_inc(rq->sched->score);
 	list_add_tail(&entity->list, &rq->entities);
-
-	spin_unlock(&rq->lock);
 }
 
 /**
@@ -241,6 +238,8 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 				struct drm_sched_entity *entity)
 {
+	lockdep_assert_held(&entity->lock);
+
 	if (list_empty(&entity->list))
 		return;
 
@@ -253,7 +252,7 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 		rq->current_entity = NULL;
 
 	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-		drm_sched_rq_remove_fifo_locked(entity);
+		drm_sched_rq_remove_fifo_locked(entity, rq);
 
 	spin_unlock(&rq->lock);
 }
@@ -355,7 +354,6 @@ drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
 				return ERR_PTR(-ENOSPC);
 			}
 
-			rq->current_entity = entity;
 			reinit_completion(&entity->entity_idle);
 			break;
 		}
@@ -601,6 +599,9 @@ static void drm_sched_job_timedout(struct work_struct *work)
  * callers responsibility to release it manually if it's not part of the
  * pending list any more.
  *
+ * This function is typically used for reset recovery (see the docu of
+ * drm_sched_backend_ops.timedout_job() for details). Do not call it for
+ * scheduler teardown, i.e., before calling drm_sched_fini().
  */
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
@@ -673,16 +674,20 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 	 */
 	cancel_delayed_work(&sched->work_tdr);
 }
-
 EXPORT_SYMBOL(drm_sched_stop);
 
 /**
  * drm_sched_start - recover jobs after a reset
  *
  * @sched: scheduler instance
+ * @errno: error to set on the pending fences
  *
+ * This function is typically used for reset recovery (see the docu of
+ * drm_sched_backend_ops.timedout_job() for details). Do not call it for
+ * scheduler startup. The scheduler itself is fully operational after
+ * drm_sched_init() succeeded.
  */
-void drm_sched_start(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
 {
 	struct drm_sched_job *s_job, *tmp;
 
@@ -697,13 +702,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched)
 		atomic_add(s_job->credits, &sched->credit_count);
 
 		if (!fence) {
-			drm_sched_job_done(s_job, -ECANCELED);
+			drm_sched_job_done(s_job, errno ?: -ECANCELED);
 			continue;
 		}
 
 		if (dma_fence_add_callback(fence, &s_job->cb,
 					   drm_sched_job_done_cb))
-			drm_sched_job_done(s_job, fence->error);
+			drm_sched_job_done(s_job, fence->error ?: errno);
 	}
 
 	drm_sched_start_timeout_unlocked(sched);
@@ -778,6 +783,10 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
  * Drivers must make sure drm_sched_job_cleanup() if this function returns
  * successfully, even when @job is aborted before drm_sched_job_arm() is called.
  *
+ * Note that this function does not assign a valid value to each struct member
+ * of struct drm_sched_job. Take a look at that struct's documentation to see
+ * who sets which struct member with what lifetime.
+ *
  * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
  * has died, which can mean that there's no valid runqueue for a @entity.
  * This function returns -ENOENT in this case (which probably should be -EIO as
@@ -803,6 +812,14 @@ int drm_sched_job_init(struct drm_sched_job *job,
 		return -EINVAL;
 	}
 
+	/*
+	 * We don't know for sure how the user has allocated. Thus, zero the
+	 * struct so that unallowed (i.e., too early) usage of pointers that
+	 * this function does not set is guaranteed to lead to a NULL pointer
+	 * exception instead of UB.
+	 */
+	memset(job, 0, sizeof(*job));
+
 	job->entity = entity;
 	job->credits = credits;
 	job->s_fence = drm_sched_fence_alloc(entity, owner);
@@ -1333,6 +1350,19 @@ EXPORT_SYMBOL(drm_sched_init);
  * @sched: scheduler instance
 *
 * Tears down and cleans up the scheduler.
+ *
+ * This stops submission of new jobs to the hardware through
+ * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
+ * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
+ * There is no solution for this currently. Thus, it is up to the driver to make
+ * sure that
+ *    a) drm_sched_fini() is only called after for all submitted jobs
+ *       drm_sched_backend_ops.free_job() has been called or that
+ *    b) the jobs for which drm_sched_backend_ops.free_job() has not been called
+ *       after drm_sched_fini() ran are freed manually.
+ *
+ * FIXME: Take care of the above problem and prevent this function from leaking
+ * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
  */
 void drm_sched_fini(struct drm_gpu_scheduler *sched)
 {
@@ -1348,7 +1378,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 		list_for_each_entry(s_entity, &rq->entities, list)
 			/*
 			 * Prevents reinsertion and marks job_queue as idle,
-			 * it will removed from rq in drm_sched_entity_fini
+			 * it will be removed from the rq in drm_sched_entity_fini()
 			 * eventually
 			 */
 			s_entity->stopped = true;
@@ -1428,8 +1458,10 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
 
 /**
  * drm_sched_wqueue_stop - stop scheduler submission
- *
  * @sched: scheduler instance
+ *
+ * Stops the scheduler from pulling new jobs from entities. It also stops
+ * freeing jobs automatically through drm_sched_backend_ops.free_job().
  */
 void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
 {
@@ -1441,8 +1473,12 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
 
 /**
  * drm_sched_wqueue_start - start scheduler submission
- *
  * @sched: scheduler instance
+ *
+ * Restarts the scheduler after drm_sched_wqueue_stop() has stopped it.
+ *
+ * This function is not necessary for 'conventional' startup. The scheduler is
+ * fully operational after drm_sched_init() succeeded.
  */
 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
 {
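Usage illustration (not part of the diff): a minimal sketch of how a driver timeout handler would use the reset-recovery entry points as documented above, now that drm_sched_start() takes an errno. The names my_timedout_job() and my_hw_reset() are hypothetical placeholders; only the drm_sched_*() calls, struct drm_sched_job, and DRM_GPU_SCHED_STAT_NOMINAL come from the scheduler API.

#include <drm/gpu_scheduler.h>

/* Hypothetical driver-specific hardware reset helper (assumption). */
static int my_hw_reset(struct drm_gpu_scheduler *sched);

static enum drm_gpu_sched_stat my_timedout_job(struct drm_sched_job *bad)
{
        struct drm_gpu_scheduler *sched = bad->sched;
        int err;

        /* Park the scheduler and detach the done callbacks (reset recovery only). */
        drm_sched_stop(sched, bad);

        err = my_hw_reset(sched);

        /*
         * Re-queue the pending jobs. Jobs whose hardware fence is gone are
         * completed with the given errno (-ECANCELED is used when 0 is passed).
         */
        drm_sched_start(sched, err);

        return DRM_GPU_SCHED_STAT_NOMINAL;
}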