# Create and test a Python package against multiple versions of its dependencies.

trigger:
  tags:
    include:
      - "*"
  branches:
    include:
      - master
      - release/*
      - refs/tags/*
pr:
  - master
  - release/*
jobs:
  - job: integrate_GPU
    strategy:
      matrix:
        "torch | 2.0":
          docker-image: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"
          torch-ver: "2.0"
          requires: "oldest"
        "torch | 2.x":
          docker-image: "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime"
          torch-ver: "2.6"
    # how long to run the job before automatically cancelling it
    timeoutInMinutes: "40"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: "lit-rtx-3090"
    variables:
      DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0,1"; print(gpus)' )
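      # assumption: agents are named "<machine>_<gpu-ids>" (e.g. "host_0,1"), so the
      # suffix after the last "_" selects the GPUs; names without "_" default to "0,1"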
      # these caches assume the jobs run repeatedly on the same set of machines
      TORCH_HOME: "/var/tmp/torch"
      TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
      HF_HOME: "/var/tmp/hf/home"
      HF_HUB_CACHE: "/var/tmp/hf/hub"
      PIP_CACHE_DIR: "/var/tmp/pip"
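      # note: /var/tmp is bind-mounted from the host (see the container options below),
      # so these caches survive container restarts on the same machine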
    container:
      image: "$(docker-image)"
      options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro -v /var/tmp:/var/tmp"
    workspace:
      clean: all
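    # pipeline flow: set env vars -> print image info -> (optionally) pin oldest
    # requirements -> align torch versions -> install -> sanity check -> run tests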
    steps:
      - bash: |
          set -ex
          devices=$(DEVICES)
          # overwrite and use only a single device
          device=${devices%,*}
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$device"
          CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
          CUDA_version_mm="${CUDA_version//'.'/''}"
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
          # packages for running the assistant script
          pip install -q fire wget packaging
        displayName: "Set env. vars"
      - bash: |
          whoami && id
          lspci | grep -E 'VGA|3D'
          whereis nvidia
          nvidia-smi
          echo $CUDA_VISIBLE_DEVICES
          echo $TORCH_URL
          python --version
          pip --version
          pip cache dir
          pip list
        displayName: "Image info & NVIDIA"
      - bash: |
          set -e
          python .github/assistant.py set-oldest-versions --req_files='["requirements/_integrate.txt"]'
          cat requirements/_integrate.txt
        condition: eq(variables['requires'], 'oldest')
        displayName: "Setting oldest req."
      - bash: |
          set -e
          python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
          for fpath in requirements/*.txt; do
            # the torch version is derived from the one installed in the used Docker image
            python adjust-torch-versions.py $fpath
          done
        displayName: "Adjust versions"
      - bash: |
          pip install -q -r requirements/_integrate.txt
          # force reinstall TM as it could be overwritten by the integrations' dependencies
          pip install . -U -r requirements/_tests.txt --find-links ${TORCH_URL}
        displayName: "Install package & integrations"
      - bash: |
          set -e
          pip list
          python -c "from torch import __version__ as ver ; assert '.'.join(str(ver).split('.')[:2]) == '$(torch-ver)', f'PyTorch: {ver}'"
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 1, f'found GPUs: {mgpu}'"
        displayName: "Sanity check"
      - bash: pytest . -v --durations=0 --timeout=360
        workingDirectory: "tests/integrations/"
        timeoutInMinutes: "15"
        displayName: "Test integrations"