Commit 9558b22

Add paligemma server support
1 parent: 08cc0b8

2 files changed: +91 −0 lines


llama_cpp/_llava.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import os
+import ctypes
+import typing
+import contextlib
+
+import numpy as np
+
+import llama_cpp
+import llama_cpp.llava_cpp as llava_cpp
+
+
+class LlavaEmbedding:
+    def __init__(self, embedding: ctypes._Pointer[llava_cpp.llava_image_embed]):
+        self._embedding = embedding
+        self._exit_stack = contextlib.ExitStack()
+
+        def llava_image_embed_free():
+            llava_cpp.llava_image_embed_free(self._embedding)
+
+        self._exit_stack.callback(llava_image_embed_free)
+
+    @property
+    def n_image_pos(self) -> int:
+        return self._embedding.contents.n_image_pos
+
+    def embed(
+        self, llama_ctx: llama_cpp.llama_context_p, n_tokens: int, n_batch: int
+    ) -> int:
+        n_past = ctypes.c_int(n_tokens)
+        n_past_p = ctypes.pointer(n_past)
+        llava_cpp.llava_eval_image_embed(
+            llama_ctx,
+            self._embedding,
+            n_batch,
+            n_past_p,
+        )
+        return n_past.value
+
+    def numpy_view(self, shape: typing.Tuple[int, int]) -> np.ndarray:
+        return np.ctypeslib.as_array(
+            self._embedding.contents.embed, shape=shape
+        )
+
+
+class LlavaModel:
+    def __init__(self, path: str, n_threads: int = 1):
+        self._path = path
+        self._n_threads = n_threads
+        self._exit_stack = contextlib.ExitStack()
+
+        if not os.path.exists(self._path):
+            raise ValueError(f"Clip model path does not exist: {self._path}")
+
+        clip_ctx = llava_cpp.clip_model_load(self._path.encode(), 0)
+
+        if clip_ctx is None:
+            raise ValueError(f"Failed to load clip model: {self._path}")
+
+        self._clip_ctx = clip_ctx
+
+        def clip_free():
+            llava_cpp.clip_free(self._clip_ctx)
+            print("Clip model freed")
+
+        self._exit_stack.callback(clip_free)
+
+    def embed_bytes(self, image_bytes: bytes):
+        embed = llava_cpp.llava_image_embed_make_with_bytes(
+            self._clip_ctx,
+            self._n_threads,
+            (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
+            len(image_bytes),
+        )
+        return LlavaEmbedding(embed)
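
For context, the following is a minimal usage sketch of these helpers, not part of the commit. The file paths (mmproj.gguf, model.gguf, image.jpg) are hypothetical, and it assumes llama_cpp.Llama exposes its raw context pointer as the ctx property, its batch size as n_batch, and its embedding width via n_embd(), as elsewhere in llama-cpp-python:

import llama_cpp
from llama_cpp._llava import LlavaModel

# Hypothetical paths; substitute real files.
clip = LlavaModel("mmproj.gguf", n_threads=4)  # CLIP/projector weights
llama = llama_cpp.Llama(model_path="model.gguf", n_ctx=2048)

with open("image.jpg", "rb") as f:
    embedding = clip.embed_bytes(f.read())  # -> LlavaEmbedding

# View the raw embedding as an (n_image_pos, n_embd) array without copying.
view = embedding.numpy_view((embedding.n_image_pos, llama.n_embd()))

# Decode the image embedding into the context starting at position 0;
# the return value is the updated n_past to continue text generation from.
n_past = embedding.embed(llama.ctx, n_tokens=0, n_batch=llama.n_batch)

Note that both classes register their cleanup callbacks on a contextlib.ExitStack but never close it themselves, so the clip context and image embedding are freed only if something later calls close() on those stacks.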

llama_cpp/server/model.py

Lines changed: 14 additions & 0 deletions
@@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler(
             clip_model_path=settings.clip_model_path, verbose=settings.verbose
         )
+    elif settings.chat_format == "paligemma":
+        assert settings.clip_model_path is not None, "clip model not found"
+        if settings.hf_model_repo_id is not None:
+            chat_handler = (
+                llama_cpp.llama_chat_format.PaligemmaChatHandler.from_pretrained(
+                    repo_id=settings.hf_model_repo_id,
+                    filename=settings.clip_model_path,
+                    verbose=settings.verbose,
+                )
+            )
+        else:
+            chat_handler = llama_cpp.llama_chat_format.PaligemmaChatHandler(
+                clip_model_path=settings.clip_model_path, verbose=settings.verbose
+            )
     elif settings.chat_format == "hf-autotokenizer":
         assert (
             settings.hf_pretrained_model_name_or_path is not None
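
As a usage sketch (again not part of the diff), the new branch can be exercised by building the server's ModelSettings with chat_format="paligemma". The file names here are hypothetical, and it assumes ModelSettings is importable from llama_cpp.server.settings as in the rest of the server code:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

settings = ModelSettings(
    model="paligemma-text.gguf",    # hypothetical language-model GGUF
    chat_format="paligemma",
    clip_model_path="mmproj.gguf",  # hypothetical vision projector; must be set
)
llama = load_llama_from_model_settings(settings)

When hf_model_repo_id is set, the handler is instead created via PaligemmaChatHandler.from_pretrained, with clip_model_path reused as the filename to download from the Hugging Face repo. Since the server derives its CLI flags from these settings, the equivalent invocation should be python -m llama_cpp.server --model paligemma-text.gguf --chat_format paligemma --clip_model_path mmproj.gguf.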
