Multi-gpu serving (#1670)

rasbt · web-flow · commit ef9647cfa7cd · 2024-08-12T13:42:34.000-05:00
diff --git a/litgpt/deploy/serve.py b/litgpt/deploy/serve.py
@@ -27,7 +27,8 @@ def __init__(
         temperature: float = 0.8,
         top_k: int = 50,
         top_p: float = 1.0,
-        max_new_tokens: int = 50
+        max_new_tokens: int = 50,
+        devices: int = 1
     ) -> None:
 
         if not _LITSERVE_AVAILABLE:
@@ -41,6 +42,7 @@ def __init__(
         self.top_k = top_k
         self.max_new_tokens = max_new_tokens
         self.top_p = top_p
+        self.devices = devices
 
     def setup(self, device: str) -> None:
         if ":" in device:
@@ -57,9 +59,11 @@ def setup(self, device: str) -> None:
         )
 
         self.llm.distribute(
+            devices=self.devices,
             accelerator=accelerator,
             quantize=self.quantize,
-            precision=self.precision
+            precision=self.precision,
+            generate_strategy="sequential" if self.devices is not None and self.devices > 1 else None
         )
         print("Model successfully initialized.")
 
@@ -78,9 +82,10 @@ def __init__(
         temperature: float = 0.8,
         top_k: int = 50,
         top_p: float = 1.0,
-        max_new_tokens: int = 50
+        max_new_tokens: int = 50,
+        devices: int = 1
     ):
-        super().__init__(checkpoint_dir, quantize, precision, temperature, top_k, top_p, max_new_tokens)   
+        super().__init__(checkpoint_dir, quantize, precision, temperature, top_k, top_p, max_new_tokens, devices)   
 
     def setup(self, device: str):
         super().setup(device)
@@ -109,9 +114,10 @@ def __init__(
         temperature: float = 0.8,
         top_k: int = 50,
         top_p: float = 1.0,
-        max_new_tokens: int = 50
+        max_new_tokens: int = 50,
+        devices: int = 1
     ):
-        super().__init__(checkpoint_dir, quantize, precision, temperature, top_k, top_p, max_new_tokens)   
+        super().__init__(checkpoint_dir, quantize, precision, temperature, top_k, top_p, max_new_tokens, devices)   
 
     def setup(self, device: str):
         super().setup(device)
@@ -197,9 +203,10 @@ def run_server(
                 top_k=top_k,
                 top_p=top_p,
                 max_new_tokens=max_new_tokens,
+                devices=devices
                 ),
             accelerator=accelerator,
-            devices=devices
+            devices=1  # We need to use the devices inside the `SimpleLitAPI` class
             )
 
     else:
@@ -212,9 +219,10 @@ def run_server(
                 top_k=top_k,
                 top_p=top_p,
                 max_new_tokens=max_new_tokens,
+                devices=devices  # We need to use the devices inside the `StreamLitAPI` class
                 ),
             accelerator=accelerator,
-            devices=devices,
+            devices=1,
             stream=True
             )
 
diff --git a/tests/test_serve.py b/tests/test_serve.py
@@ -98,3 +98,46 @@ def run_server():
         if process:
             process.kill()
         server_thread.join()
+
+
+@RunIf(min_cuda_gpus=2)
+def test_multi_gpu_serve(tmp_path):
+    seed_everything(123)
+    ours_config = Config.from_name("pythia-14m")
+    download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path)
+    shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer.json"), str(tmp_path))
+    shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer_config.json"), str(tmp_path))
+    ours_model = GPT(ours_config)
+    checkpoint_path = tmp_path / "lit_model.pth"
+    torch.save(ours_model.state_dict(), checkpoint_path)
+    config_path = tmp_path / "model_config.yaml"
+    with open(config_path, "w", encoding="utf-8") as fp:
+        yaml.dump(asdict(ours_config), fp)
+
+    run_command = [
+        "litgpt", "serve", tmp_path, "--devices", "2"
+    ]
+
+    process = None
+
+    def run_server():
+        nonlocal process
+        try:
+            process = subprocess.Popen(run_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            stdout, stderr = process.communicate(timeout=10)
+        except subprocess.TimeoutExpired:
+            print('Server start-up timeout expired')
+
+    server_thread = threading.Thread(target=run_server)
+    server_thread.start()
+
+    time.sleep(10)
+
+    try:
+        response = requests.get("http://127.0.0.1:8000")
+        print(response.status_code)
+        assert response.status_code == 200, "Server did not respond as expected."
+    finally:
+        if process:
+            process.kill()
+        server_thread.join()