From 70e846d53b596b2dfa213d5c3fce8569d972c7a5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 24 Feb 2025 17:36:26 +0000 Subject: [PATCH 1/9] test(neuron): refactor to prepare batch export --- integration-tests/fixtures/neuron/model.py | 27 +++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py index 3345e2ea221..2d58351ce71 100644 --- a/integration-tests/fixtures/neuron/model.py +++ b/integration-tests/fixtures/neuron/model.py @@ -118,10 +118,11 @@ def get_tgi_docker_image(): return docker_image -def export_model(config_name, model_config, neuron_model_name): - """Export a neuron model. +def maybe_export_model(config_name, model_config): + """Export a neuron model for the specified test configuration. - The model is exported by a custom image built on the fly from the base TGI image. + If the neuron model has not already been compiled and pushed to the hub, it is + exported by a custom image built on the fly from the base TGI image. This makes sure the exported model and image are aligned and avoids introducing neuron specific imports in the test suite. 
@@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name): Used to identify test configurations model_config (`str`): The model configuration for export (includes the original model id) - neuron_model_name (`str`): - The name of the exported model on the hub """ + neuron_model_name = get_neuron_model_name(config_name) + neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}" + hub = huggingface_hub.HfApi() + if hub.repo_exists(neuron_model_id): + logger.info( + f"Skipping model export for config {config_name} as {neuron_model_id} already exists" + ) + return neuron_model_id client = docker.from_env() @@ -206,6 +213,7 @@ def export_model(config_name, model_config, neuron_model_name): except Exception as e: logger.error("Error while removing image %s, skipping", image.id) logger.exception(e) + return neuron_model_id @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) @@ -232,14 +240,11 @@ def neuron_model_config(request): """ config_name = request.param model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param]) - neuron_model_name = get_neuron_model_name(config_name) - neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}" + # Export the model first (only if needed) + neuron_model_id = maybe_export_model(config_name, model_config) with TemporaryDirectory() as neuron_model_path: - hub = huggingface_hub.HfApi() - if not hub.repo_exists(neuron_model_id): - # Export the model first - export_model(config_name, model_config, neuron_model_name) logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub") + hub = huggingface_hub.HfApi() hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path) # Add dynamic parameters to the model configuration model_config["neuron_model_path"] = neuron_model_path From 53c12269397a9bea7701d8f488a5b4531b75b342 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 24 Feb 2025 17:37:37 +0000 Subject: [PATCH 2/9] test(neuron): add helper to batch export models 
Also rename fixture file for clarity. --- integration-tests/conftest.py | 2 +- .../fixtures/neuron/{model.py => export_models.py} | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) rename integration-tests/fixtures/neuron/{model.py => export_models.py} (97%) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index e0451052120..0ffcd162b85 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,4 +1,4 @@ -pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"] +pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"] # ruff: noqa: E402 from _pytest.fixtures import SubRequest import requests diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/export_models.py similarity index 97% rename from integration-tests/fixtures/neuron/model.py rename to integration-tests/fixtures/neuron/export_models.py index 2d58351ce71..a49642da826 100644 --- a/integration-tests/fixtures/neuron/model.py +++ b/integration-tests/fixtures/neuron/export_models.py @@ -216,6 +216,11 @@ def maybe_export_model(config_name, model_config): return neuron_model_id 
a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 824a5a28413..c9918ac64aa 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -252,16 +252,13 @@ jobs: - name: Install run: | make install-integration-tests - - name: Run tests + - name: Export neuron models run: | - export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }} export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }} - export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }} - export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}" - export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }} echo $DOCKER_IMAGE docker pull $DOCKER_IMAGE - pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }} + python integration-tests/fixtures/neuron/export_models.py integration_tests: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} From 0cff388a104220fa68d3310a9979a79be336e4fe Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 10:40:42 +0000 Subject: [PATCH 4/9] ci(neuron): rename precompilation job --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c9918ac64aa..b7cc79556a3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -230,7 +230,7 @@ jobs: echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT" - precompile_static_models: + precompile_neuron_models: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -263,7 +263,7 @@ jobs: concurrency: group: 
${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true - needs: [precompile_static_models, build-and-push] + needs: [precompile_neuron_models, build-and-push] if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }} runs-on: group: ${{ needs.build-and-push.outputs.runs_on }} From f6859c417951b7d83e4224c203fa315f7cd9fddb Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 09:07:19 +0000 Subject: [PATCH 5/9] test(neuron): remove redundant subdirectory --- integration-tests/neuron/{integration => }/test_generate.py | 0 integration-tests/neuron/{integration => }/test_implicit_env.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename integration-tests/neuron/{integration => }/test_generate.py (100%) rename integration-tests/neuron/{integration => }/test_implicit_env.py (100%) diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/test_generate.py similarity index 100% rename from integration-tests/neuron/integration/test_generate.py rename to integration-tests/neuron/test_generate.py diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/test_implicit_env.py similarity index 100% rename from integration-tests/neuron/integration/test_implicit_env.py rename to integration-tests/neuron/test_implicit_env.py From e783f88dc526ed182c61df51c837ec0b28330566 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 09:21:25 +0000 Subject: [PATCH 6/9] test(neuron): remove erroneous line --- integration-tests/fixtures/neuron/export_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/integration-tests/fixtures/neuron/export_models.py b/integration-tests/fixtures/neuron/export_models.py index a49642da826..fe5bdddc8c5 100644 --- 
a/integration-tests/fixtures/neuron/export_models.py +++ b/integration-tests/fixtures/neuron/export_models.py @@ -190,7 +190,7 @@ def maybe_export_model(config_name, model_config): logger.debug("Build logs %s", logs) try: - container = client.containers.run( + client.containers.run( export_image, environment=env, auto_remove=True, @@ -199,7 +199,6 @@ def maybe_export_model(config_name, model_config): shm_size="1G", ) logger.info(f"Successfully exported model for config {config_name}") - container.logs() except Exception as e: logger.exception(f"An exception occurred while running container: {e}.") pass From d59b4fdce9cb4df062ddae04a0f63e839d9f7a3e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 10:48:17 +0000 Subject: [PATCH 7/9] doc(neuron): update links to installation page --- docs/source/_toctree.yml | 2 +- docs/source/architecture.md | 2 +- docs/source/installation_inferentia.md | 2 +- docs/source/multi_backend_support.md | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 39f0ef4bdbc..37b57d6f428 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -12,7 +12,7 @@ - local: installation_gaudi title: Using TGI with Intel Gaudi - local: installation_inferentia - title: Using TGI with AWS Inferentia + title: Using TGI with AWS Trainium and Inferentia - local: installation_tpu title: Using TGI with Google TPUs - local: installation_intel diff --git a/docs/source/architecture.md b/docs/source/architecture.md index d3a6fa9260e..b475bb6dc7e 100644 --- a/docs/source/architecture.md +++ b/docs/source/architecture.md @@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ. 
- A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ. - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi). -- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference). +- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ. - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference). Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations. diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md index 0394e6ded37..bfd0f657754 100644 --- a/docs/source/installation_inferentia.md +++ b/docs/source/installation_inferentia.md @@ -1,3 +1,3 @@ # Using TGI with Inferentia -Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2. +You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron). 
diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index 03d6d30be55..997503a4f19 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam However, it requires a model-specific compilation step for each GPU architecture. * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation. +* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/). From be06297e624b4be2a11d61f7efcfd21a472609a1 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 09:10:25 +0000 Subject: [PATCH 8/9] feat(neuron): cleanup Dockerfile CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse is not required anymore. 
--- Dockerfile.neuron | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 17d256915f5..c7c4af68ba7 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked WORKDIR /usr/src -ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse - FROM chef AS planner COPY backends/neuron/Cargo.toml Cargo.toml COPY Cargo.lock Cargo.lock From b3709026266f85746017f7c1068f13ed17aaaf7c Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 25 Feb 2025 12:54:22 +0000 Subject: [PATCH 9/9] test(neuron): try to reduce download errors --- integration-tests/fixtures/neuron/export_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integration-tests/fixtures/neuron/export_models.py b/integration-tests/fixtures/neuron/export_models.py index fe5bdddc8c5..836402ecaf0 100644 --- a/integration-tests/fixtures/neuron/export_models.py +++ b/integration-tests/fixtures/neuron/export_models.py @@ -249,7 +249,9 @@ def neuron_model_config(request): with TemporaryDirectory() as neuron_model_path: logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub") hub = huggingface_hub.HfApi() - hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path) + hub.snapshot_download( + neuron_model_id, etag_timeout=30, local_dir=neuron_model_path + ) # Add dynamic parameters to the model configuration model_config["neuron_model_path"] = neuron_model_path model_config["neuron_model_id"] = neuron_model_id