Skip to content

Commit d52cf48

Browse files
bweltonkentrussell
authored andcommitted
amd/amdkfd: fix double lock aquisition in set_perfcount
Seperates out locking from update_queue to allow updating of queues by code already holding the mqd lock. Fixes a hang in set_perfcount. This change was in the original mailing list commit for set_perfcount but was not included in gerrit. Fixes: b58289f0abf7 ("Add kfd_ioctl_profiler to contain profiler kernel driver changes") Signed-off-by: Benjamin Welton <benjamin.welton@amd.com> Acked-by: Kent Russell <kent.russell@amd.com> Change-Id: Ica4a5881c357a67dfe0922b39574e25f07718687
1 parent 1746a9c commit d52cf48

File tree

1 file changed

+39
-33
lines changed

1 file changed

+39
-33
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

+39-33
Original file line numberDiff line numberDiff line change
@@ -318,29 +318,6 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
318318
return r;
319319
}
320320

321-
static void set_perfcount(struct device_queue_manager *dqm, int enable)
322-
{
323-
struct device_process_node *cur;
324-
struct qcm_process_device *qpd;
325-
struct queue *q;
326-
struct mqd_update_info minfo = { 0 };
327-
328-
if (!dqm)
329-
return;
330-
331-
minfo.update_flag = (enable == 1 ? UPDATE_FLAG_PERFCOUNT_ENABLE :
332-
UPDATE_FLAG_PERFCOUNT_DISABLE);
333-
dqm_lock(dqm);
334-
list_for_each_entry(cur, &dqm->queues, list) {
335-
qpd = cur->qpd;
336-
list_for_each_entry(q, &qpd->queues_list, list) {
337-
pqm_update_mqd(qpd->pqm, q->properties.queue_id,
338-
&minfo);
339-
}
340-
}
341-
dqm_unlock(dqm);
342-
}
343-
344321
static int remove_all_queues_mes(struct device_queue_manager *dqm)
345322
{
346323
struct device_process_node *cur;
@@ -952,7 +929,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
952929
return retval;
953930
}
954931

955-
static int update_queue(struct device_queue_manager *dqm, struct queue *q,
932+
static int update_queue_locked(struct device_queue_manager *dqm, struct queue *q,
956933
struct mqd_update_info *minfo)
957934
{
958935
int retval = 0;
@@ -961,11 +938,9 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
961938
struct kfd_process_device *pdd;
962939
bool prev_active = false;
963940

964-
dqm_lock(dqm);
965941
pdd = kfd_get_process_device_data(q->device, q->process);
966942
if (!pdd) {
967-
retval = -ENODEV;
968-
goto out_unlock;
943+
return -ENODEV;
969944
}
970945
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
971946
q->properties.type)];
@@ -983,13 +958,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
983958

984959
/* queue is reset so inaccessable */
985960
if (pdd->has_reset_queue) {
986-
retval = -EACCES;
987-
goto out_unlock;
961+
return -EACCES;
988962
}
989963

990964
if (retval) {
991965
dev_err(dev, "unmap queue failed\n");
992-
goto out_unlock;
966+
return retval;
993967
}
994968
} else if (prev_active &&
995969
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
@@ -998,7 +972,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
998972

999973
if (!dqm->sched_running) {
1000974
WARN_ONCE(1, "Update non-HWS queue while stopped\n");
1001-
goto out_unlock;
975+
return retval;
1002976
}
1003977

1004978
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
@@ -1008,7 +982,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
1008982
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
1009983
if (retval) {
1010984
dev_err(dev, "destroy mqd failed\n");
1011-
goto out_unlock;
985+
return retval;
1012986
}
1013987
}
1014988

@@ -1056,11 +1030,43 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
10561030
&q->properties, current->mm);
10571031
}
10581032

1059-
out_unlock:
1033+
return retval;
1034+
}
1035+
1036+
static int update_queue(struct device_queue_manager *dqm, struct queue *q,
1037+
struct mqd_update_info *minfo)
1038+
{
1039+
int retval;
1040+
1041+
dqm_lock(dqm);
1042+
retval = update_queue_locked(dqm, q, minfo);
10601043
dqm_unlock(dqm);
1044+
10611045
return retval;
10621046
}
10631047

1048+
static void set_perfcount(struct device_queue_manager *dqm, int enable)
1049+
{
1050+
struct device_process_node *cur;
1051+
struct qcm_process_device *qpd;
1052+
struct queue *q;
1053+
struct mqd_update_info minfo = { 0 };
1054+
1055+
if (!dqm)
1056+
return;
1057+
1058+
minfo.update_flag = (enable == 1 ? UPDATE_FLAG_PERFCOUNT_ENABLE :
1059+
UPDATE_FLAG_PERFCOUNT_DISABLE);
1060+
dqm_lock(dqm);
1061+
list_for_each_entry(cur, &dqm->queues, list) {
1062+
qpd = cur->qpd;
1063+
list_for_each_entry(q, &qpd->queues_list, list) {
1064+
update_queue_locked(dqm, q, &minfo);
1065+
}
1066+
}
1067+
dqm_unlock(dqm);
1068+
}
1069+
10641070
/* suspend_single_queue does not lock the dqm like the
10651071
* evict_process_queues_cpsch or evict_process_queues_nocpsch. You should
10661072
* lock the dqm before calling, and unlock after calling.

0 commit comments

Comments
 (0)