Merge pull request #56 from MoseleyBioinformaticsLab/amd
Implements the AMD GPU querier class
erikhuck authored Feb 22, 2025
2 parents dedc361 + a724702 commit 2c006ec
Showing 2 changed files with 73 additions and 16 deletions.
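
For orientation before the diffs: the new _AMDQuerier mirrors the _NvidiaQuerier interface. The sketch below is a hypothetical usage illustration only: both querier classes are private, and the tracker appears to select one internally (falling back to amd-smi when nvidia-smi is unavailable, judging by the test mocks below); the method names and column names come from the code added in this commit.

# Illustration only: _AMDQuerier is a private class; gpu_tracker appears to pick
# the querier itself, falling back to amd-smi when nvidia-smi is not installed.
import pandas as pd
from gpu_tracker.tracker import _AMDQuerier

static: pd.DataFrame = _AMDQuerier.static_info()        # columns: uuid, ram (total VRAM per GPU)
per_proc: pd.DataFrame = _AMDQuerier.process_ram()      # columns: pid, ram (bytes converted to megabytes)
live: pd.DataFrame = _AMDQuerier.ram_and_utilization()  # columns: uuid, ram, utilization_percent
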
39 changes: 38 additions & 1 deletion src/gpu_tracker/tracker.py
@@ -54,7 +54,7 @@ class _NvidiaQuerier(_GPUQuerier):
command = 'nvidia-smi'

@classmethod
def _query_gpu(cls, *args: list[str], ram_column: str | None = None):
def _query_gpu(cls, *args: list[str], ram_column: str):
gpu_info = super()._query_gpu(*args, '--format=csv')
gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
@@ -75,8 +75,45 @@ def ram_and_utilization(cls) -> pd.DataFrame:
gpu_info.utilization_percent = [float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
return gpu_info


class _AMDQuerier(_GPUQuerier):
command = 'amd-smi'
__id_to_uuid = None

@classmethod
@property
def _id_to_uuid(cls) -> dict[int, str]:
if cls.__id_to_uuid is None:
gpu_info = super()._query_gpu('list', '--csv')
cls.__id_to_uuid = {gpu_id: uuid for gpu_id, uuid in zip(gpu_info.gpu, gpu_info.gpu_uuid)}
return cls.__id_to_uuid

@classmethod
def _query_gpu(cls, *args: list[str], ram_column: str) -> pd.DataFrame:
gpu_info = super()._query_gpu(*args, '--csv')
if 'gpu' in gpu_info.columns:
gpu_info.gpu = [cls._id_to_uuid[gpu_id] for gpu_id in gpu_info.gpu]
gpu_info = gpu_info.rename(columns={'gpu': 'uuid'})
return gpu_info.rename(columns={ram_column: 'ram'})

@classmethod
def static_info(cls) -> pd.DataFrame:
gpu_info = cls._query_gpu('static', '--vram', ram_column='size')
return gpu_info[['uuid', 'ram']]

@classmethod
def process_ram(cls) -> pd.DataFrame:
gpu_info = cls._query_gpu('process', ram_column='vram_mem')
gpu_info.ram = [ram / 1e6 for ram in gpu_info.ram] # RAM is in bytes for the process subcommand.
return gpu_info[['pid', 'ram']]

@classmethod
def ram_and_utilization(cls) -> pd.DataFrame:
gpu_info = cls._query_gpu('monitor', '--vram-usage', '--gfx', ram_column='vram_used')
gpu_info = gpu_info[['uuid', 'gfx', 'ram']]
gpu_info.gfx = gpu_info.gfx.astype(float)
return gpu_info.rename(columns={'gfx': 'utilization_percent'})


class _TrackingProcess(mproc.Process):
_CPU_PERCENT_INTERVAL = 0.1
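
Based on the arguments the new class passes to _query_gpu, and assuming the shared _GPUQuerier._query_gpu simply runs cls.command followed by the given arguments (as the '--format=csv' handling in _NvidiaQuerier suggests), the querier would issue the amd-smi invocations listed in the sketch below; this is a reading of the diff, not taken from amd-smi documentation.

# Assumed command lines, reconstructed from the diff above rather than from amd-smi docs.
amd_smi_calls = [
    ['amd-smi', 'list', '--csv'],                              # GPU index -> UUID mapping (cached in __id_to_uuid)
    ['amd-smi', 'static', '--vram', '--csv'],                  # total VRAM per GPU ('size' column)
    ['amd-smi', 'process', '--csv'],                           # per-process VRAM in bytes ('vram_mem' column)
    ['amd-smi', 'monitor', '--vram-usage', '--gfx', '--csv'],  # current VRAM use and graphics utilization ('gfx')
]
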
50 changes: 35 additions & 15 deletions tests/test_tracker.py
@@ -10,6 +10,11 @@
'GPU. Otherwise the GPU RAM and GPU utilization values will remain 0.0.')


@pt.fixture(name='gpu_brand', params=['amd', 'nvidia'])
def get_gpu_brand(request) -> str:
yield request.param


@pt.fixture(name='operating_system', params=['Linux', 'not-linux'])
def get_operating_system(request) -> str:
yield request.param
@@ -30,8 +35,8 @@ def get_use_context_manager(request) -> bool:

@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit,gpu_uuids,n_expected_cores', test_tracker_data)
def test_tracker(
mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str, gpu_uuids: set[str],
n_expected_cores: int):
mocker, gpu_brand: str, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str,
gpu_uuids: set[str], n_expected_cores: int):
class EventMock:
def __init__(self):
self.count = 0
@@ -126,17 +131,31 @@ def start_mock(self):
'gpu_tracker.tracker.psutil.virtual_memory', side_effect=[
mocker.MagicMock(total=67 * 1e9), mocker.MagicMock(used=30 * 1e9), mocker.MagicMock(used=31 * 1e9),
mocker.MagicMock(used=29 * 1e9)])
nvidia_smi_outputs = [
b'',
b'',
b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB',
b'pid, used_gpu_memory [MiB]\n',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %',
b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %',
b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%']
check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=nvidia_smi_outputs)
if gpu_brand == 'nvidia':
check_output_side_effect = [
b'',
b'',
b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB',
b'pid, used_gpu_memory [MiB]\n',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %',
b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %',
b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB',
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%']
else:
check_output_side_effect = [
FileNotFoundError,
b'',
b'gpu,size,extraneous-col\n0,12198,some-val\n1,12198,some-val\n2 , 12198,some-val',
b'gpu,gpu_uuid,extraneous-col\n0,gpu-id1,some-val\n1,gpu-id2,some-val\n2,gpu-id3 ,some-val',
b'pid,vram_mem\n',
b'gpu,vram_used,gfx\n0,0,0\n1 ,0,0\n2 ,0,0',
b'pid,vram_mem\n12,1600000000\n21,700000000\n22,200000000',
b'gpu,vram_used,gfx\n0,1600,75\n1,900,50\n2,500,25',
b'pid,vram_mem\n12,1500000000\n21,2100000000\n22,2200000000',
b'gpu,vram_used,gfx\n0,1500,55\n1,4300,45\n2,700,35'
]
check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=check_output_side_effect)
cpu_count_mock = mocker.patch('gpu_tracker.tracker.psutil.cpu_count', return_value=4)
cpu_percent_mock = mocker.patch(
'gpu_tracker.tracker.psutil.cpu_percent', side_effect=[[67.5, 27.3, 77.8, 97.9], [57.6, 58.2, 23.5, 99.8], [78.3, 88.3, 87.2, 22.5]])
@@ -157,6 +176,7 @@ def start_mock(self):
gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores)
tracker.start()
tracker.stop()
        gput.tracker._AMDQuerier._AMDQuerier__id_to_uuid = None  # Reset the name-mangled UUID cache so the next parametrized run rebuilds it.
assert start_mock.called
assert not os.path.isfile(tracker._resource_usage_file)
assert not log_spy.called
@@ -175,7 +195,7 @@ def start_mock(self):
utils.assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 3)
utils.assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 3)
utils.assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 3)
assert len(check_output_mock.call_args_list) == 9
    assert len(check_output_mock.call_args_list) == (10 if gpu_brand == 'amd' else 9)
os_mock.getpid.assert_called_once_with()
utils.assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5)
cpu_percent_interval = gput.tracker._TrackingProcess._CPU_PERCENT_INTERVAL
@@ -211,7 +231,7 @@ def side_effect_func(command, *_, **__) -> None:
# The check_output mock is called 3 times before it's supposed to, causing a "RuntimeError: generator raised StopIteration".
if command in ('nvidia-smi', 'amd-smi'):
raise exceptions.pop()
raise FileNotFoundError()
raise FileNotFoundError() # pragma: nocover
mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=side_effect_func)
gput.Tracker()
gput.Tracker()
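
As a closing illustration of the mapping step exercised by the mocked amd-smi outputs above, the standalone sketch below reproduces how _AMDQuerier._query_gpu replaces the numeric gpu column with UUIDs and renames columns. It only mirrors the logic visible in the diff and assumes the base _GPUQuerier parses CSV output with pandas.

# Standalone sketch; the CSV strings are simplified from the test mocks above.
import io
import pandas as pd

list_csv = 'gpu,gpu_uuid\n0,gpu-id1\n1,gpu-id2\n2,gpu-id3'        # as from `amd-smi list --csv`
monitor_csv = 'gpu,vram_used,gfx\n0,1600,75\n1,900,50\n2,500,25'  # as from `amd-smi monitor --vram-usage --gfx --csv`

listing = pd.read_csv(io.StringIO(list_csv))
id_to_uuid = dict(zip(listing.gpu, listing.gpu_uuid))             # {0: 'gpu-id1', 1: 'gpu-id2', 2: 'gpu-id3'}

monitor = pd.read_csv(io.StringIO(monitor_csv))
monitor.gpu = [id_to_uuid[gpu_id] for gpu_id in monitor.gpu]      # same replacement as in _AMDQuerier._query_gpu
monitor = monitor.rename(columns={'gpu': 'uuid', 'vram_used': 'ram', 'gfx': 'utilization_percent'})
monitor.utilization_percent = monitor.utilization_percent.astype(float)
print(monitor)  # columns: uuid, ram, utilization_percent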
