Skip to content

Commit 312f400

Browse files
committed
Fix vocab padding in generator
1 parent 7feed80 commit 312f400

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

exllamav2/generator/streaming.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ def __init__(self, model, cache, tokenizer, draft_model = None, draft_cache = No
6262

6363
self.no_tokens = torch.empty((1, 0), dtype = torch.long)
6464
self.no_probs = torch.empty((1, 0), dtype = torch.float)
65-
self.no_logits = torch.empty((0, self.model.config.vocab_size), dtype = torch.float)
65+
self.no_logits = torch.empty((0, ((self.model.config.vocab_size + 31) // 32) * 32), dtype = torch.float)
6666

6767
if draft_model:
6868
self.draft_model = draft_model
@@ -193,7 +193,8 @@ def _stream(self) -> (str, bool, torch.Tensor, torch.Tensor, torch.Tensor):
193193
self.held_text += new_text
194194
self.held_tokens = torch.cat([self.held_tokens, next_token], dim = -1)
195195
self.held_probs = torch.cat([self.held_probs, next_prob], dim = -1)
196-
self.held_logits = torch.cat([self.held_logits, next_logits], dim = 0)
196+
if self.return_logits:
197+
self.held_logits = torch.cat([self.held_logits, next_logits], dim = 0)
197198

198199
# Return now if newly added token ends a filter
199200

0 commit comments

Comments (0)