
Commit 8be6c6a

Merge pull request #84 from huggingface/kv-default
Make KV cache the only strategy for inference
2 parents a173568 + 9d2a556 commit 8be6c6a

File tree

1 file changed: +10 −23 lines changed

models/vision_language_model.py

Lines changed: 10 additions & 23 deletions
@@ -60,7 +60,7 @@ def forward(self, input_ids, image, attention_mask=None, targets=None):
         return logits, loss

     @torch.inference_mode()
-    def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5, top_k=50, top_p=0.9, temperature=0.5, greedy=False, use_kv_cache: bool = True):
+    def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5, top_k=50, top_p=0.9, temperature=0.5, greedy=False):

         # 1. Process image
         image_embd = self.vision_encoder(image) # [B, T_img, D_model]
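For callers, the visible change is the removed keyword argument: KV caching is no longer optional. A minimal usage sketch follows; generate() and its remaining parameters come from this diff, while the model instance and the preprocessed inputs (model, input_ids, image, attention_mask) are assumed to exist and are purely illustrative.

# Usage sketch: only the generate(...) signature is taken from this diff;
# how `model` and the inputs are built is assumed, not shown here.
generated_ids = model.generate(
    input_ids,                      # [B, T_prompt] prompt token ids
    image,                          # [B, C, H, W] preprocessed image tensor
    attention_mask=attention_mask,  # [B, T_prompt]
    max_new_tokens=32,
    top_k=50,
    top_p=0.9,
    temperature=0.5,
    greedy=False,                   # use_kv_cache no longer exists; passing it raises TypeError
)                                   # -> [B, N] newly generated token ids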
@@ -122,27 +122,14 @@ def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5, top_
             if attention_mask is not None:
                 attention_mask = torch.cat((attention_mask, torch.ones((batch_size, 1), device=attention_mask.device, dtype=attention_mask.dtype)), dim=1)

-            if use_kv_cache:
-                # With KV cache: only process the new token
-                decode_step_output, kv_cache_list = self.decoder(
-                    next_token_embed,
-                    attention_mask=attention_mask,
-                    kv_cache=kv_cache_list,
-                    start_pos=current_token_start_pos
-                )
-            else:
-                # Without KV cache: process the entire sequence from scratch
-                # Reconstruct the full sequence: image + prompt + generated tokens so far
-                generated_token_embeds = torch.cat([self.decoder.token_embedding(tid) for tid in newly_generated_ids_list], dim=1)
-                full_sequence_embeds = torch.cat([initial_combined_embeds, generated_token_embeds], dim=1)
-
-                decode_step_output, _ = self.decoder(
-                    full_sequence_embeds,
-                    attention_mask=attention_mask,
-                    kv_cache=None,
-                    start_pos=0
-                )
-
+            # With KV cache: only process the new token
+            decode_step_output, kv_cache_list = self.decoder(
+                next_token_embed,
+                attention_mask=attention_mask,
+                kv_cache=kv_cache_list,
+                start_pos=current_token_start_pos
+            )
+
             last_token_output = decode_step_output[:, -1, :]

             # Apply head to get logits (if model is in embedding mode)
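To show the mechanism the loop now relies on exclusively, here is a small self-contained sketch of one cached-attention decode step: the image/prompt embeddings are prefilled once, and every later step feeds only the newest token embedding plus the cached keys/values and its absolute start position, mirroring the kv_cache/start_pos contract of self.decoder above. The toy layer, its dimensions, and the single-head attention are assumptions for illustration, not the repo's decoder.

import torch

torch.manual_seed(0)
D = 16  # toy model width

# Toy projection matrices standing in for one attention layer (illustrative only).
Wq, Wk, Wv = (torch.randn(D, D) * D**-0.5 for _ in range(3))

def decode_step(x_new, kv_cache, start_pos):
    # start_pos is where positional encodings (e.g. RoPE) would use the absolute
    # offset; this toy only uses it for the causal mask.
    k_new, v_new = x_new @ Wk, x_new @ Wv
    if kv_cache is None:
        k, v = k_new, v_new                          # prefill: the cache starts here
    else:
        k = torch.cat([kv_cache[0], k_new], dim=1)   # append new K to the cache
        v = torch.cat([kv_cache[1], v_new], dim=1)   # append new V to the cache
    q = x_new @ Wq                                   # queries only for the new token(s)
    scores = (q @ k.transpose(1, 2)) / D**0.5        # [B, T_new, T_total]
    # Causal mask: the query at absolute position start_pos+i sees cache positions <= start_pos+i.
    pos_q = start_pos + torch.arange(q.shape[1]).unsqueeze(1)
    pos_k = torch.arange(k.shape[1]).unsqueeze(0)
    scores = scores.masked_fill(pos_k > pos_q, float("-inf"))
    out = torch.softmax(scores, dim=-1) @ v          # [B, T_new, D]
    return out, (k, v)

B = 1
prompt_embeds = torch.randn(B, 5, D)                 # stands in for image + prompt embeddings
out, cache = decode_step(prompt_embeds, None, start_pos=0)   # prefill once

for step in range(3):
    next_embed = torch.randn(B, 1, D)                # stands in for next_token_embed
    out, cache = decode_step(next_embed, cache, start_pos=5 + step)
    print(step, out.shape, cache[0].shape)           # output stays [1, 1, 16]; cache grows by one position

Because each step attends a single new query against the cached keys, per-token cost stays roughly constant, whereas the deleted else-branch re-embedded and reprocessed the whole image + prompt + generated sequence on every step.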
@@ -152,7 +139,7 @@ def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5, top_
                 current_logits = last_token_output

         if not newly_generated_ids_list: # Handle case where max_new_tokens might be 0
-            return torch.empty((B,0), dtype=torch.long, device=input_ids.device)
+            return torch.empty((batch_size,0), dtype=torch.long, device=input_ids.device)

         return torch.cat(newly_generated_ids_list, dim=1)

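The rename fixes a stale reference: batch_size is the name already used earlier in generate (e.g. in the attention-mask update), while B does not appear to be defined in this scope, so the max_new_tokens=0 path would likely have raised a NameError. A quick standalone check of the two return shapes, with illustrative values:

import torch

batch_size = 2  # illustrative value

# max_new_tokens == 0 path: nothing was generated, so return an empty [B, 0] id tensor.
empty_out = torch.empty((batch_size, 0), dtype=torch.long, device="cpu")
print(empty_out.shape)                                      # torch.Size([2, 0])

# Normal path: each decode step appends a [B, 1] id tensor to newly_generated_ids_list.
newly_generated_ids_list = [torch.randint(0, 100, (batch_size, 1)) for _ in range(4)]
print(torch.cat(newly_generated_ids_list, dim=1).shape)     # torch.Size([2, 4])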
