
Commit c16c72c

pytholic and rasbt authored

[fix][1760] Added fix for the missing context key issue in dolly! (#1766)
Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>

1 parent ae798ab, commit c16c72c

6 files changed: +64 -7 lines

litgpt/data/dolly.py (2 additions, 2 deletions)

@@ -71,6 +71,6 @@ def setup(self, stage: str = "") -> None:
 
 
 def _transform(item: dict) -> dict:
-    item["input"] = item.pop("context")
-    item["output"] = item.pop("response")
+    item["input"] = item.get("context", "")
+    item["output"] = item.get("response", "")
     return item
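This is the substance of the fix: `dict.pop` removes the key from the item, so a record missing `context` (or a second pass over an already-transformed item) raised a `KeyError`, while `dict.get` leaves the item intact and falls back to an empty string. A minimal standalone sketch, not part of the commit, contrasting the two behaviors:

# Hypothetical sketch: old pop-based transform vs. new get-based transform.
old_item = {"context": "some context", "response": "some response"}
new_item = {"context": "some context", "response": "some response"}

def transform_pop(item: dict) -> dict:  # old behavior
    item["input"] = item.pop("context")    # removes the key from the item
    item["output"] = item.pop("response")
    return item

def transform_get(item: dict) -> dict:  # new behavior
    item["input"] = item.get("context", "")    # keeps the key; "" fallback
    item["output"] = item.get("response", "")
    return item

transform_pop(old_item)
try:
    transform_pop(old_item)  # second pass: "context" is gone -> KeyError
except KeyError as err:
    print(f"KeyError: {err}")

transform_get(new_item)
transform_get(new_item)  # idempotent: no error, original keys preserved
print(new_item["input"], new_item["output"])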

litgpt/finetune/adapter.py (1 addition, 1 deletion)

@@ -382,7 +382,7 @@ def generate_example(fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, eval: E
         model.clear_kv_cache()
         model.train()
         output = tokenizer.decode(output)
-        fabric.print(output)
+        fabric.print(f"{output}\n")
     else:
         print(
             f"Length of encoded instruction ({len(encoded)}) and eval.max_new_tokens ({eval.max_new_tokens}) "

litgpt/finetune/adapter_v2.py (1 addition, 1 deletion)

@@ -378,7 +378,7 @@ def generate_example(fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, eval: E
         model.clear_kv_cache()
         model.train()
         output = tokenizer.decode(output)
-        fabric.print(output)
+        fabric.print(f"{output}\n")
     else:
         print(
             f"Length of encoded instruction ({len(encoded)}) and eval.max_new_tokens ({eval.max_new_tokens}) "

litgpt/finetune/full.py (1 addition, 1 deletion)

@@ -354,7 +354,7 @@ def generate_example(fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, eval: E
         model.clear_kv_cache()
         model.train()
         output = tokenizer.decode(output)
-        fabric.print(output)
+        fabric.print(f"{output}\n")
     else:
         print(
             f"Length of encoded instruction ({len(encoded)}) and eval.max_new_tokens ({eval.max_new_tokens}) "

litgpt/finetune/lora.py (1 addition, 1 deletion)

@@ -410,7 +410,7 @@ def generate_example(fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, eval: E
         model.clear_kv_cache()
         model.train()
         output = tokenizer.decode(output)
-        fabric.print(output)
+        fabric.print(f"{output}\n")
     else:
         print(
             f"Length of encoded instruction ({len(encoded)}) and eval.max_new_tokens ({eval.max_new_tokens}) "

tests/data/test_dolly.py (58 additions, 1 deletion)

@@ -5,7 +5,12 @@
 
 
 def test_dolly(mock_tokenizer, dolly_path):
-    dolly = Dolly(val_split_fraction=0.5, download_dir=dolly_path.parent, file_name=dolly_path.name, num_workers=0)
+    dolly = Dolly(
+        val_split_fraction=0.5,
+        download_dir=dolly_path.parent,
+        file_name=dolly_path.name,
+        num_workers=0,
+    )
     assert isinstance(dolly.prompt_style, AlpacaPromptStyle)
     dolly.connect(mock_tokenizer, batch_size=2, max_seq_length=10)
     dolly.prepare_data()
@@ -29,3 +34,55 @@ def test_dolly(mock_tokenizer, dolly_path):
 
     # has attributes from super class `LightningDataModule`
     assert dolly.prepare_data_per_node
+
+
+def test_dolly_missing_keys(mock_tokenizer, dolly_path):
+    """
+    Notes
+    -----
+    - Added only for the dolly dataset.
+
+    References
+    ----------
+    - Reference issue: https://github.com/Lightning-AI/litgpt/issues/1760
+
+    Methodology
+    -----------
+    - Simulate the original behavior by popping the `context` key.
+    - Run the dataloader, which applies `_transform`.
+    - Previously this raised a missing-`context`-key error because the key was popped.
+    - Now the `get` method is used, so the key(s) are not removed.
+    """
+
+    dolly = Dolly(
+        val_split_fraction=0.5,
+        download_dir=dolly_path.parent,
+        file_name=dolly_path.name,
+        num_workers=0,
+    )
+    dolly.connect(mock_tokenizer, batch_size=2, max_seq_length=10)
+    dolly.prepare_data()
+    dolly.setup()
+
+    # check that the datasets were created without errors
+    assert dolly.train_dataset is not None
+    assert dolly.test_dataset is not None
+
+    # verify that the transform function handled missing keys correctly
+    for dataset in [dolly.train_dataset, dolly.test_dataset]:
+        for item in dataset.data:
+            assert "context" in item
+            assert "response" in item
+            assert isinstance(item["context"], str)
+            assert isinstance(item["response"], str)
+            # drop the `context` and `response` keys to simulate
+            # the behavior of the original issue with `item.pop`
+            item.pop("context")
+            item.pop("response")
+
+    # check that we can iterate through the dataloader without errors;
+    # the previous approach would raise a KeyError here since the keys were already popped
+    train_dataloader = dolly.train_dataloader()
+    train_batch = next(iter(train_dataloader))
+    assert "input_ids" in train_batch
+    assert "labels" in train_batch
