
Commit 459262b

Merge branch 'main' into fix-quantization-lib

IlyasMoutawwakil committed Sep 4, 2024
2 parents 1b66595 + 6dfec54
Showing 22 changed files with 95 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test_api_rocm.yaml
@@ -27,7 +27,7 @@ concurrency:

jobs:
run_api_rocm_tests:
runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
runs-on: [self-hosted, amd-gpu, single-gpu]

container:
image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
3 changes: 2 additions & 1 deletion .github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0,1
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -45,7 +46,7 @@

- name: Install dependencies
run: |
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq] "deepspeed<0.15"
- name: Run tests
env:
3 changes: 2 additions & 1 deletion .github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -52,4 +53,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/rocm
run: |
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq or gptq)"
2 changes: 1 addition & 1 deletion Makefile
@@ -172,10 +172,10 @@ test_cli_rocm_pytorch_multi_gpu:
test_cli_rocm_pytorch_single_gpu:
pytest -s -k "cli and rocm and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"

# llm-perf
test_cli_llama_cpp:
pytest -s -k "llama_cpp"

# llm-perf
install_llm_perf_cuda_pytorch:
pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
pip install -U transformers huggingface_hub[hf_transfer]
10 changes: 5 additions & 5 deletions docker/rocm/Dockerfile
@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG ROCM_VERSION=5.7.1
ARG ROCM_VERSION=6.1.2
ARG UBUNTU_VERSION=22.04

FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

# Install necessary packages
ENV PATH="/opt/rocm/bin:${PATH}"
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
sudo build-essential git bash-completion \
python3.10 python3-pip python3.10-dev && \
@@ -29,13 +29,13 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco
cd /opt/rocm/share/amd_smi && pip install .

# Install PyTorch
ARG TORCH_ROCM=rocm5.7
ARG TORCH_ROCM=rocm6.1
ARG TORCH_VERSION=stable

RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nighly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nightly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_ROCM} ; \
else \
pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
fi
2 changes: 1 addition & 1 deletion docker/unroot/Dockerfile
@@ -1,4 +1,4 @@
ARG IMAGE
ARG IMAGE="optimum-benchmark:latest"

FROM $IMAGE

2 changes: 1 addition & 1 deletion optimum_benchmark/__init__.py
@@ -1,7 +1,7 @@
from .backends import (
BackendConfig,
IPEXConfig,
INCConfig,
IPEXConfig,
LlamaCppConfig,
LLMSwarmConfig,
ORTConfig,
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/__init__.py
@@ -1,7 +1,7 @@
from .config import BackendConfig
from .ipex.config import IPEXConfig
from .llama_cpp.config import LlamaCppConfig
from .llm_swarm.config import LLMSwarmConfig
from .ipex.config import IPEXConfig
from .neural_compressor.config import INCConfig
from .onnxruntime.config import ORTConfig
from .openvino.config import OVConfig
3 changes: 2 additions & 1 deletion optimum_benchmark/backends/config.py
@@ -97,9 +97,10 @@ def __post_init__(self):
if is_nvidia_system():
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"CUDA_VISIBLE_DEVICES was set to {os.environ['CUDA_VISIBLE_DEVICES']}.")
elif is_rocm_system():
# https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html
os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"ROCR_VISIBLE_DEVICES was set to {os.environ['ROCR_VISIBLE_DEVICES']}.")
else:
raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

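For context on the hunk above: visibility variables such as `ROCR_VISIBLE_DEVICES` only take effect if they are exported before the GPU runtime enumerates devices, which is why `BackendConfig.__post_init__` sets them (and now logs the value) up front. A minimal standalone sketch of that ordering, with the system check stubbed out as a hypothetical helper:

```python
import os

def is_rocm_system() -> bool:
    # Hypothetical stand-in for the repo's driver detection.
    return os.path.exists("/opt/rocm")

# Restrict the process to GPU 0 *before* importing torch: once the
# ROCr/HIP runtime has enumerated devices, changing the variable no
# longer affects the current process.
if is_rocm_system():
    os.environ["ROCR_VISIBLE_DEVICES"] = "0"

import torch  # noqa: E402  (imported late on purpose)

print(torch.cuda.device_count())  # expected: 1 on a multi-GPU ROCm host
```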
4 changes: 0 additions & 4 deletions optimum_benchmark/backends/ipex/backend.py
@@ -1,14 +1,11 @@
import inspect
from collections import OrderedDict
from tempfile import TemporaryDirectory
from typing import Any, Dict

import torch
from hydra.utils import get_class

from ...generators.dataset_generator import DatasetGenerator
from ...import_utils import is_accelerate_available, is_torch_distributed_available
from ...task_utils import TEXT_GENERATION_TASKS
from ..base import Backend
from ..transformers_utils import fast_weights_init
from .config import IPEXConfig
@@ -33,7 +30,6 @@ def __init__(self, config: IPEXConfig) -> None:
else:
raise NotImplementedError(f"IPEXBackend does not support task {self.config.task}")


def load(self) -> None:
self.logger.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
6 changes: 3 additions & 3 deletions optimum_benchmark/backends/ipex/config.py
@@ -1,11 +1,12 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Optional
from dataclasses import dataclass
from typing import Optional

from ...import_utils import ipex_version
from ..config import BackendConfig

TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"]


@dataclass
class IPEXConfig(BackendConfig):
name: str = "ipex"
@@ -34,4 +35,3 @@ def __post_init__(self):

if self.torch_dtype is not None and self.torch_dtype not in TORCH_DTYPES:
raise ValueError(f"`torch_dtype` must be one of {TORCH_DTYPES}. Got {self.torch_dtype} instead.")

1 change: 0 additions & 1 deletion optimum_benchmark/backends/ipex/utils.py
@@ -7,4 +7,3 @@
"image-classification": "optimum.intel.IPEXModelForImageClassification",
"audio-classification": "optimum.intel.IPEXModelForAudioClassification",
}

2 changes: 1 addition & 1 deletion optimum_benchmark/cli.py
@@ -10,10 +10,10 @@
Benchmark,
BenchmarkConfig,
EnergyStarConfig,
IPEXConfig,
INCConfig,
InferenceConfig,
InlineConfig,
IPEXConfig,
LlamaCppConfig,
LLMSwarmConfig,
ORTConfig,
3 changes: 3 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -158,14 +158,17 @@ def onnxruntime_version():
except importlib.metadata.PackageNotFoundError:
return None


def openvino_version():
if _openvino_available:
return importlib.metadata.version("openvino")


def ipex_version():
if _ipex_available:
return importlib.metadata.version("intel_extension_for_pytorch")


def neural_compressor_version():
if _neural_compressor_available:
return importlib.metadata.version("neural_compressor")
6 changes: 3 additions & 3 deletions optimum_benchmark/launchers/base.py
@@ -39,10 +39,10 @@ def launch(self, worker: Callable[..., BenchmarkReport], worker_args: List[Any])
@contextmanager
def device_isolation(self, pid: int, device_ids: Optional[str] = None):
if device_ids is None:
if is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)
elif is_rocm_system():
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None)
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)

self.device_isolation_process = Process(
target=assert_device_isolation,
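The hunk above flips the fallback order so that on a host exposing both driver stacks, the ROCm variable is consulted first. A self-contained rephrasing of that precedence (the helper below is hypothetical, not the repo's API):

```python
import os
from typing import Optional

def resolve_isolated_device_ids(device_ids: Optional[str], rocm: bool, nvidia: bool) -> Optional[str]:
    # Mirrors the fallback order in the hunk: explicit ids win, then
    # ROCR_VISIBLE_DEVICES on ROCm systems, then CUDA_VISIBLE_DEVICES.
    if device_ids is not None:
        return device_ids
    if rocm:
        return os.environ.get("ROCR_VISIBLE_DEVICES")
    if nvidia:
        return os.environ.get("CUDA_VISIBLE_DEVICES")
    return None

os.environ["ROCR_VISIBLE_DEVICES"] = "0,1"
print(resolve_isolated_device_ids(None, rocm=True, nvidia=True))  # -> "0,1"
```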
25 changes: 14 additions & 11 deletions optimum_benchmark/system_utils.py
@@ -173,7 +173,9 @@ def get_gpu_vram_mb() -> List[int]:

def get_gpu_device_ids() -> str:
if is_nvidia_system():
if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
if os.environ.get("NVIDIA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["NVIDIA_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_pynvml_available():
@@ -187,12 +189,12 @@ def get_gpu_device_ids() -> str:
device_ids = ",".join(str(i) for i in device_ids)
pynvml.nvmlShutdown()
elif is_rocm_system():
if os.environ.get("GPU_DEVICE_ORDINAL", None) is not None:
device_ids = os.environ["GPU_DEVICE_ORDINAL"]
if os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("HIP_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["HIP_VISIBLE_DEVICES"]
elif os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_amdsmi_available() or not is_pyrsmi_available():
raise ValueError(
@@ -201,17 +203,18 @@
"or PyRSMI library from https://github.com/ROCm/pyrsmi."
)

if is_amdsmi_available():
if is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()

elif is_amdsmi_available():
amdsmi.amdsmi_init()
device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
device_ids = ",".join(str(i) for i in device_ids)
amdsmi.amdsmi_shut_down()

elif is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()
else:
raise ValueError("Couldn't infer GPU device ids.")

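The two hunks above change which environment variables `get_gpu_device_ids` trusts, and in what order, before falling back to SMI enumeration. A condensed, self-contained sketch of the new precedence (hypothetical helper, not the repo's function):

```python
import os
from typing import Optional

# New lookup order per the hunks above: the runtime-level variables
# (NVIDIA_/ROCR_) are consulted before the framework-level ones.
NVIDIA_ORDER = ("NVIDIA_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")
ROCM_ORDER = ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")

def device_ids_from_env(rocm: bool) -> Optional[str]:
    for var in ROCM_ORDER if rocm else NVIDIA_ORDER:
        value = os.environ.get(var)
        if value is not None:
            return value
    return None  # caller falls through to pyrsmi/amdsmi enumeration

os.environ["HIP_VISIBLE_DEVICES"] = "0"
print(device_ids_from_env(rocm=True))  # -> "0" (no ROCR_VISIBLE_DEVICES set)
```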
2 changes: 1 addition & 1 deletion optimum_benchmark/trackers/energy.py
@@ -105,7 +105,7 @@ def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency":
def from_energy(energy: "Energy", volume: int, unit: str) -> "Efficiency":
return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} energy efficiency: {self.value:f} ({self.unit})")


2 changes: 1 addition & 1 deletion optimum_benchmark/trackers/latency.py
@@ -76,7 +76,7 @@ def from_values(values: List[float], unit: str) -> "Latency":
values=values,
)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0
LOGGER.info(f"\t\t+ {prefix} latency:")
LOGGER.info(f"\t\t\t- count: {self.count}")
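With the default prefix changing from "method" to an empty string in both trackers, call sites that want labelled log lines must now pass the prefix explicitly. A minimal usage sketch, assuming the `Latency.from_values` constructor shown in the hunk:

```python
import logging

from optimum_benchmark.trackers.latency import Latency

logging.basicConfig(level=logging.INFO)  # the trackers log via the module LOGGER

latency = Latency.from_values([0.12, 0.11, 0.13], unit="s")
latency.log(prefix="forward")  # labelled: "+ forward latency: ..."
latency.log()                  # unlabelled under the new default
```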
9 changes: 7 additions & 2 deletions optimum_benchmark/trackers/memory.py
@@ -81,7 +81,7 @@ def aggregate(memories: List["Memory"]) -> "Memory":
max_allocated=max_allocated,
)

def log(self, prefix: str = "forward"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} memory:")
if self.max_ram is not None:
LOGGER.info(f"\t\t\t- max RAM: {self.max_ram:f} ({self.unit})")
@@ -303,7 +303,12 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio
device_handle = devices_handles[device_id]

try:
used_global_memory += rocml.smi_get_device_memory_used(device_id)
if is_amdsmi_available():
used_global_memory += amdsmi.amdsmi_get_gpu_memory_total(
device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
)
elif is_pyrsmi_available():
used_global_memory += rocml.smi_get_device_memory_used(device_id, type="VRAM")
except Exception as e:
LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")

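The hunk above replaces the unconditional pyrsmi call with an amdsmi-first branch. A self-contained sketch of that fallback, assuming at least one of the two ROCm Python bindings is installed (the function name is hypothetical; both calls are taken verbatim from the hunk):

```python
def rocm_vram_reading(device_id: int, device_handle) -> int:
    # Prefer the amdsmi binding when importable, mirroring the hunk: it
    # queries the handle's VRAM figure via amdsmi_get_gpu_memory_total.
    try:
        import amdsmi
        return amdsmi.amdsmi_get_gpu_memory_total(
            device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
        )
    except ImportError:
        # Fall back to pyrsmi's rocml, which reports per-device usage.
        from pyrsmi import rocml
        return rocml.smi_get_device_memory_used(device_id, type="VRAM")
```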
2 changes: 1 addition & 1 deletion setup.py
@@ -63,7 +63,7 @@
"quality": ["ruff"],
"testing": ["pytest", "hydra-joblib-launcher"],
# optimum backends
"ipex":[f"optimum[ipex]>={MIN_OPTIMUM_VERSION}"],
"ipex": [f"optimum[ipex]>={MIN_OPTIMUM_VERSION}"],
"openvino": [f"optimum[openvino,nncf]>={MIN_OPTIMUM_VERSION}"],
"onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"],
"onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"],