Skip to content

Commit 4c5549e

Browse files
ShangmingCai authored and Alvant committed
[Misc][Speculative decoding] Typos and typing fixes (vllm-project#6467)
Co-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com> Signed-off-by: Alvant <alvasian@yandex.ru>
1 parent 3a3bdda commit 4c5549e

File tree

5 files changed

+7
-7
lines changed

5 files changed

+7
-7
lines changed

vllm/spec_decode/multi_step_worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def init_device(self) -> None:
4343
)
4444

4545
def set_include_gpu_probs_tensor(self) -> None:
46-
# Need include_gpu_probs_tensor for multi_step_worker
46+
# Need include_gpu_probs_tensor for MultiStepWorker
4747
self.model_runner.model.sampler.include_gpu_probs_tensor = True
4848

4949
@torch.inference_mode()

vllm/spec_decode/ngram_worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
1414
"""NGramWorker provides a light drafter without need for model.
1515
16-
Current NGramWorker only implement prompt lookup decoding,
16+
Current NGramWorker only implements prompt lookup decoding,
1717
and in future we may also do RAG type drafter and other scenarios
1818
which don't rely on LLM model to give proposals.
1919
"""
@@ -37,7 +37,7 @@ def init_device(self):
3737
self.device = torch.device(f"cuda:{self.local_rank}")
3838
self.load_model = lambda *args, **kwargs: None
3939

40-
# Current only support Top1Proposer
40+
# Current NGramWorker only supports Top1Proposer
4141
self._proposer = Top1Proposer(
4242
weakref.proxy(self), # type: ignore[arg-type]
4343
device=self.device,

vllm/spec_decode/proposer_worker_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def sampler_output(
2424
) -> Tuple[Optional[List[SamplerOutput]], bool]:
2525
raise NotImplementedError
2626

27-
def set_include_gpu_probs_tensor(self):
27+
def set_include_gpu_probs_tensor(self) -> None:
2828
"""Implementation optional"""
2929
pass
3030

vllm/spec_decode/spec_decode_worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def __init__(
206206

207207
self.probs_dtype = self.spec_decode_sampler.probs_dtype
208208
self.token_id_dtype = self.spec_decode_sampler.token_id_dtype
209-
# Lazy initiazliation.
209+
# Lazy initialization.
210210
self.scorer: SpeculativeScorer
211211

212212
# Hidden states from target model to pass to proposer

vllm/spec_decode/top1_proposer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def _split_by_proposal_len(
138138

139139
# Currently only proposal lens of 0 or the global batch proposal len
140140
# are supported.
141-
# If max_proposal_len is defined, then we shall no exccess this
141+
# If max_proposal_len is defined, then we shall no exceed this
142142
# quota for nonzero_proposal
143143
new_k = 0
144144
if (self.max_proposal_len is None
@@ -219,7 +219,7 @@ def _merge_outputs(
219219
proposal_lens: List[int],
220220
nonzero_proposal_len_indices: List[int],
221221
sampler_transposed: bool,
222-
) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]:
222+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
223223
"""After speculations are produced, merge the speculation results with
224224
the skipped sequences.
225225
"""

0 commit comments

Comments (0)