# Create and test a Python package against multiple versions of its dependencies.

trigger:
  tags:
    include:
      - "*"
  branches:
    include:
      - master
      - release/*
      - refs/tags/*
pr:
  - master
  - release/*
jobs:
  - job: integrate_GPU
    strategy:
      matrix:
        "torch | 2.0":
          docker-image: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"
          torch-ver: "2.0"
          requires: "oldest"
        "torch | 2.x":
          docker-image: "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime"
          torch-ver: "2.6"
    # how long to run the job before automatically cancelling it
    timeoutInMinutes: "40"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: "lit-rtx-3090"
    variables:
      DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0,1"; print(gpus)' )
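      # assumption: agents are named "<machine>_<gpu-ids>" (e.g. "host_0,1"), so the
      # suffix after the last "_" selects the GPUs; names without "_" default to "0,1"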
      # these caches assume the jobs run repeatedly on the same set of machines
      TORCH_HOME: "/var/tmp/torch"
      TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
      HF_HOME: "/var/tmp/hf/home"
      HF_HUB_CACHE: "/var/tmp/hf/hub"
      PIP_CACHE_DIR: "/var/tmp/pip"
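      # note: /var/tmp is bind-mounted from the host (see the container options below),
      # so these caches survive container restarts on the same machine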
    container:
      image: "$(docker-image)"
      options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro -v /var/tmp:/var/tmp"
    workspace:
      clean: all
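    # pipeline flow: set env vars -> print image info -> (optionally) pin oldest
    # requirements -> align torch versions -> install -> sanity check -> run tests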
    steps:
      - bash: |
          set -ex
          devices=$(DEVICES)
          # overwrite and use only a single device
          device=${devices%,*}
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$device"
          CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
          CUDA_version_mm="${CUDA_version//'.'/''}"
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
          # packages for running the assistant script
          pip install -q fire wget packaging
        displayName: "Set env. vars"
      - bash: |
          whoami && id
          lspci | grep -E 'VGA|3D'
          whereis nvidia
          nvidia-smi
          echo $CUDA_VISIBLE_DEVICES
          echo $TORCH_URL
          python --version
          pip --version
          pip cache dir
          pip list
        displayName: "Image info & NVIDIA"
      - bash: |
          set -e
          python .github/assistant.py set-oldest-versions --req_files='["requirements/_integrate.txt"]'
          cat requirements/_integrate.txt
        condition: eq(variables['requires'], 'oldest')
        displayName: "Setting oldest req."
      - bash: |
          set -e
          python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
          for fpath in requirements/*.txt; do
            # the torch version is derived from the one installed in the used Docker image
            python adjust-torch-versions.py $fpath
          done
        displayName: "Adjust versions"
      - bash: |
          pip install -q -r requirements/_integrate.txt
          # force reinstall TM as it could be overwritten by the integrations' dependencies
          pip install . -U -r requirements/_tests.txt --find-links ${TORCH_URL}
        displayName: "Install package & integrations"
      - bash: |
          set -e
          pip list
          python -c "from torch import __version__ as ver ; assert '.'.join(str(ver).split('.')[:2]) == '$(torch-ver)', f'PyTorch: {ver}'"
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 1, f'found GPUs: {mgpu}'"
        displayName: "Sanity check"
      - bash: pytest . -v --durations=0 --timeout=360
        workingDirectory: "tests/integrations/"
        timeoutInMinutes: "15"
        displayName: "Test integrations"