Add NeuronxDistributedInference support, Speculative Decoding, Dynamic on-device sampling #16357
Merged: simon-mo merged 44 commits into vllm-project:main from aws-neuron:upstream-neuron-vllm-04-08 on May 7, 2025.
Commits (44):
- 6531f92 [NxDI upstream foundation] set up NxDI model runner
- 626850c Support speculation with transformers-neuronx (sssrijan-amazon)
- 7fcf9b2 Add support for eagle speculation using transformers-neuronx (sssrijan-amazon)
- 3a8e7a5 Support speculation with neuronx-distributed-inference for batch 1 (sssrijan-amazon)
- bf9a4c6 [Tnx] Fix streaming flow for speculation (sssrijan-amazon)
- a2671aa [NxdI] Support eagle speculation (sssrijan-amazon)
- 0b075b1 [TNx+EAGLE] Use FusedSpeculativeDecoder for EAGLE + Linear token tree. (chongmni-aws)
- 703efd9 Fix the termination check for accepted speculative tokens. (chongmni-aws)
- 6c04502 [TNx][Bug fix] fix the incorrect speculation output check. (chongmni-aws)
- 31203d7 Add continuous batching with eagle (aws-amulyaab)
- ae29889 [NxDI] Fix masking of padding in speculative output (aws-patlange)
- 466cd01 Remove assertion on bs=1 when using speculation now that we support bs>1 (elaineyz)
- ac90709 Modify NxDI and TNx multi step model runners (used for speculation) t… (elaineyz)
- f10f17b Add multi-step NxD model runner (aws-tailinpa)
- d123747 Add Framework selection logic (aws-satyajith)
- 547709a Fix no free blocks error (aws-patlange)
- e91857a Refactor and add basic docstrings (elaineyz)
- 70ad6a9 Modification to enable Vllm-neuronx instead of Vllm for KTF (aws-navyadhara)
- 2df94fe Fix global_top_k to be aligned with NxDI default (aws-yishanm)
- da8d1cf Add neuron model runner tests for updating sampling param (chongmni-aws)
- 498a918 Updating requirements-neuron.txt (aws-navyadhara)
- 8bc6537 Updating requirements-neuron.txt (aws-navyadhara)
- 4907e3e multi-node TP support
- 248b708 Removing codenames that fail IP Scanning (aws-navyadhara)
- 4d3c6ae Format auto-check and formatting changes (aws-satyajith)
- 51f403f Bug fix: Missing NxDI model runner addressed (aws-satyajith)
- 543f55d set world_size default value as 1
- d15c20b Formatting changes to satisfy all pre-commit hooks (aws-satyajith)
- 49d7558 Revert "Removing codenames that fail IP Scanning" (aws-navyadhara)
- 87eeb3c Fix issues identified by mypy checks (aws-satyajith)
- 39ad22c Fix logging strings (aws-satyajith)
- 53e821a add example offline script for EAGLE spec (elaineyz)
- aaa9f17 Merge branch 'upstreaming_main' into upstream-neuron-vllm-04-08 (aws-satyajith)
- 5f4eb2f Add speculative token tree and skip EAGLEConfig creation for Neuron (aws-satyajith)
- d04c552 Fix imports and satisfy pre-commit hooks (aws-satyajith)
- ea4e9c7 Remove deprecated files and .gitignore additions (aws-satyajith)
- 7c86b26 Merge branch 'upstreaming_main' into upstream-neuron-vllm-04-08 (aws-satyajith)
- d34883c Merge branch 'main' into upstream-neuron-vllm-04-08 (aws-satyajith)
- d762abd Add docstring for speculative_token_tree (aws-satyajith)
- 9d49902 Merge branch 'upstreaming_main' into upstream-neuron-vllm-04-08 (aws-satyajith)
- a31715c Merge branch 'vllm-project:main' into upstream-neuron-vllm-04-08 (mrinalks)
- fa055e5 Remove multi-node support. Remove num_lookahead_slots exception for s… (aws-satyajith)
- 8092f4b Merge branch 'vllm-project:main' into upstream-neuron-vllm-04-08 (aws-satyajith)
- e298c41 Modify neuron speculative decoding examples to use latest speculative… (aws-satyajith)
New file (54 lines): offline inference example for EAGLE speculative decoding on Neuron.
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to run offline inference with an EAGLE speculative
decoding model on Neuron. To use EAGLE speculative decoding, you must use
a draft model that is specifically fine-tuned for EAGLE speculation.
Additionally, to use EAGLE with NxD Inference, the draft model must include
the LM head weights from the target model. These weights are shared between
the draft and target model.
"""

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "What is annapurna labs?",
]

# Create a sampling params object.
sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)

# Create an LLM.
llm = LLM(
    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
    speculative_config={
        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
        "num_speculative_tokens": 5,
        "max_model_len": 2048,
    },
    max_num_seqs=4,
    # On Neuron, max_model_len and block_size must both equal the maximum
    # sequence length. This is currently a known limitation of continuous
    # batching support in neuronx-distributed-inference.
    max_model_len=2048,
    block_size=2048,
    # The device is detected automatically when the AWS Neuron SDK is
    # installed, so this argument can be left unspecified or set explicitly.
    device="neuron",
    tensor_parallel_size=32,
    override_neuron_config={
        "enable_eagle_speculation": True,
        "enable_fused_speculation": True,
    },
)

# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\n\nGenerated text: {generated_text!r}")
New file (64 lines): offline inference example for draft-model speculative decoding on Neuron.
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to run offline inference with a speculative
decoding model on Neuron.
"""

import os

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, I am a language model and I can help",
    "The president of the United States is",
    "The capital of France is",
]


def config_buckets():
    """Configure context length and token gen buckets."""
    # Create XLA HLO graphs for all the context length buckets.
    os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
    # Create XLA HLO graphs for all the token gen buckets.
    os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"


def initialize_model():
    """Create an LLM with speculative decoding."""
    return LLM(
        model="openlm-research/open_llama_7b",
        speculative_config={
            "model": "openlm-research/open_llama_3b",
            "num_speculative_tokens": 4,
            "max_model_len": 2048,
        },
        max_num_seqs=4,
        max_model_len=2048,
        block_size=2048,
        use_v2_block_manager=True,
        device="neuron",
        tensor_parallel_size=32,
    )


def process_requests(model: LLM, sampling_params: SamplingParams):
    """Generate texts from prompts and print them."""
    outputs = model.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main():
    """Main function that sets up the model and processes prompts."""
    config_buckets()
    model = initialize_model()
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
    process_requests(model, sampling_params)


if __name__ == '__main__':
    main()
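To get a rough sense of what the draft model buys, one can time generation end to end. The helper below is an illustrative sketch (not part of this PR) built only on the `prompts`, `LLM`, and `SamplingParams` names already used in this example.

```python
# Illustrative helper (not part of this PR): time generation end to end
# and report decode throughput for the speculative configuration above.
import time


def measure_throughput(model: LLM, sampling_params: SamplingParams) -> float:
    """Return generated tokens per second across all prompts."""
    start = time.perf_counter()
    outputs = model.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start
    # Count tokens actually generated, not prompt tokens.
    generated = sum(len(o.outputs[0].token_ids) for o in outputs)
    return generated / elapsed


# Example usage, after config_buckets() and initialize_model():
#   tokens_per_s = measure_throughput(model,
#                                     SamplingParams(max_tokens=100, top_k=1))
#   print(f"Decode throughput: {tokens_per_s:.1f} tokens/s")
```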
New file (126 lines): Neuron model runner tests for updating sampling params.
# SPDX-License-Identifier: Apache-2.0
import os
from unittest.mock import MagicMock

from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.platforms.neuron import NeuronFramework
from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceData, SequenceGroupMetadata
from vllm.worker.neuron_model_runner import NeuronModelRunner

os.environ[
    'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value


def _create_neuron_model_runner(model: str, *args,
                                **kwargs) -> NeuronModelRunner:
    engine_args = EngineArgs(model, *args, **kwargs)
    engine_config = engine_args.create_engine_config()
    vllm_config = VllmConfig(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
    )
    return NeuronModelRunner(vllm_config=vllm_config)


def test_update_neuron_sampling_params_not_full_batch():
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
    model_runner = _create_neuron_model_runner(
        "facebook/opt-125m",
        seed=0,
        dtype="float16",
        max_num_seqs=2,
    )
    assert not model_runner._on_device_sampling_disabled
    # Test sampling param updating only when TNx is the framework;
    # NxDI handles sampling parameter updates inside the model.
    if current_platform.use_transformers_neuronx():
        model_mock = MagicMock()
        model_runner.model = model_mock

        seq_group_metadata_list = [
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0.5,
                                               top_k=1,
                                               top_p=0.5),
                block_tables={0: [1]},
            )
        ]

        model_runner.prepare_model_input(seq_group_metadata_list)

        # Neuron sampling parameters are indexed by block_tables indices.
        # The first block_id of sequence 0 is 1, so its parameters are
        # placed at index 1. The resulting sampling parameters are:
        # Index 0: default sampling parameters
        # Index 1: sequence 0's sampling parameters.
        neuron_sampling_params = (
            model_runner.model_config.neuron_sampling_params)
        assert neuron_sampling_params.temperature == [1.0, 0.5]
        assert neuron_sampling_params.top_k == [
            model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
        ]
        assert neuron_sampling_params.top_p == [1.0, 0.5]
        model_mock.model.update_generation_config.assert_called_once_with(
            neuron_sampling_params)


def test_update_neuron_sampling_params_full_batch():
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
    model_runner = _create_neuron_model_runner(
        "facebook/opt-125m",
        seed=0,
        dtype="float16",
        max_num_seqs=2,
    )
    assert not model_runner._on_device_sampling_disabled

    # Test sampling param updating only when TNx is the framework;
    # NxDI handles sampling parameter updates inside the model.
    if current_platform.use_transformers_neuronx():
        model_mock = MagicMock()
        model_runner.model = model_mock

        seq_group_metadata_list = [
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0.5,
                                               top_k=1,
                                               top_p=0.5),
                block_tables={0: [1]},
            ),
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
                sampling_params=SamplingParams(temperature=0.2,
                                               top_k=2,
                                               top_p=0.2),
                block_tables={1: [0]},
            )
        ]

        model_runner.prepare_model_input(seq_group_metadata_list)

        # Neuron sampling parameters are indexed by block_tables indices.
        # The first block_id of sequence 0 is 1, so its parameters are
        # placed at index 1. The resulting sampling parameters are:
        # Index 0: sequence 1's sampling parameters
        # Index 1: sequence 0's sampling parameters.
        neuron_sampling_params = (
            model_runner.model_config.neuron_sampling_params)
        assert neuron_sampling_params.temperature == [0.2, 0.5]
        assert neuron_sampling_params.top_k == [2, 1]
        assert neuron_sampling_params.top_p == [0.2, 0.5]
        model_mock.model.update_generation_config.assert_called_once_with(
            neuron_sampling_params)
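The placement logic these tests assert — each sequence's sampling parameters land at the slot given by its first block id, while untouched slots keep defaults — can be sketched in isolation. The following simplified illustration is not the actual NeuronModelRunner code; `MAX_TOP_K` is a stand-in for `_MAX_NEURON_SAMPLING_TOP_K`.

```python
# Simplified illustration (not the actual NeuronModelRunner code) of how
# per-sequence sampling parameters are placed into fixed on-device slots
# indexed by each sequence's first block id. Defaults fill unused slots.
from dataclasses import dataclass

MAX_TOP_K = 256  # stand-in for _MAX_NEURON_SAMPLING_TOP_K


@dataclass
class SlottedSamplingParams:
    temperature: list[float]
    top_k: list[int]
    top_p: list[float]


def build_slots(seq_groups, max_num_seqs: int) -> SlottedSamplingParams:
    """seq_groups: list of (first_block_id, temperature, top_k, top_p)."""
    slots = SlottedSamplingParams(
        temperature=[1.0] * max_num_seqs,
        top_k=[MAX_TOP_K] * max_num_seqs,
        top_p=[1.0] * max_num_seqs,
    )
    for block_id, temperature, top_k, top_p in seq_groups:
        slots.temperature[block_id] = temperature
        slots.top_k[block_id] = top_k
        slots.top_p[block_id] = top_p
    return slots


# Mirrors test_update_neuron_sampling_params_not_full_batch: sequence 0's
# first block id is 1, so its params land at index 1; index 0 stays default.
slots = build_slots([(1, 0.5, 1, 0.5)], max_num_seqs=2)
assert slots.temperature == [1.0, 0.5]
assert slots.top_k == [MAX_TOP_K, 1]
assert slots.top_p == [1.0, 0.5]

# Mirrors test_update_neuron_sampling_params_full_batch: sequence 1's first
# block id is 0, so the two sequences' parameters occupy swapped slots.
slots = build_slots([(1, 0.5, 1, 0.5), (0, 0.2, 2, 0.2)], max_num_seqs=2)
assert slots.temperature == [0.2, 0.5]
assert slots.top_k == [2, 1]
assert slots.top_p == [0.2, 0.5]
```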
Review comment: Can we eliminate the device-specific changes in vllm/config.py?

Review comment: +1, we should move this neuron config to override.

Review comment: @mrinalks We currently use self.draft_model_config.hf_config in multiple places. Moving this to override_neuron_config would mean deviating from the existing flow and would require comprehensive re-testing to ensure we didn't miss any parameters.
@liangfu EAGLEConfig was not present when we implemented EAGLE support in Neuron, hence the exception. I'll take a look at whether we can remove this exception for Neuron.

Review comment: This is a valid change that we'll implement and test internally first. We will address supporting EAGLEConfig in a follow-up commit. I'm adding a comment on RFC #15970 to keep track of this change.