import os
+ from typing import Any, Dict, List

- from fastapi import HTTPException
- from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
+ import numpy as np
+ import torch.nn as nn
+ import torchvision.transforms as T
+ from decord import VideoReader, cpu
+ from einops import rearrange
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_core.embeddings import Embeddings
from langchain_vdms.vectorstores import VDMS, VDMS_Client
+ from pydantic import BaseModel, model_validator
+ from torch import cat as torch_cat
+ from transformers import AutoProcessor, AutoTokenizer, CLIPModel

from comps import CustomLogger, EmbedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType

logger = CustomLogger("vdms_retrievers")
logflag = os.getenv("LOGFLAG", False)
+ toPIL = T.ToPILImage()


@OpeaComponentRegistry.register("OPEA_RETRIEVER_VDMS")
class OpeaVDMsRetriever(OpeaComponent):
    """A specialized retriever component derived from OpeaComponent for vdms retriever services.

    Attributes:
-         client (VDMs): An instance of the vdms client for vector database operations.
+         client (VDMS): An instance of the vdms client for vector database operations.
    """

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.RETRIEVER.name.lower(), description, config)

        self.embedder = self._initialize_embedder()
-         self.client = VDMS_Client(VDMS_HOST, VDMS_PORT)
+         self.client = VDMS_Client(host=VDMS_HOST, port=VDMS_PORT)
        self.vector_db = self._initialize_vector_db()
        health_status = self.check_health()
        if not health_status:
            logger.error("OpeaVDMsRetriever health check failed.")

    def _initialize_embedder(self):
        if VDMS_USE_CLIP:
-             from comps.third_parties.clip.src.clip_embedding import vCLIP
+             meanclip_cfg = {
+                 "model_name": "openai/clip-vit-base-patch32",
+                 "num_frm": 64,
+             }
+             video_retriever_model = vCLIP(meanclip_cfg)  # , device="cpu")
+             embeddings = vCLIPEmbeddings(model=video_retriever_model)

-             embeddings = vCLIP({"model_name": "openai/clip-vit-base-patch32", "num_frm": 64})
        elif TEI_EMBEDDING_ENDPOINT:
            # create embeddings using TEI endpoint service
            if logflag:
                logger.info(f"[ init embedder ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}")
-             if not HUGGINGFACEHUB_API_TOKEN:
-                 raise HTTPException(
-                     status_code=400,
-                     detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.",
-                 )
-             import requests
-
-             response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info")
-             if response.status_code != 200:
-                 raise HTTPException(
-                     status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available."
-                 )
-             model_id = response.json()["model_id"]
-             embeddings = HuggingFaceInferenceAPIEmbeddings(
-                 api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT
-             )
+             from langchain_huggingface import HuggingFaceEndpointEmbeddings
+
+             embeddings = HuggingFaceEndpointEmbeddings(model=TEI_EMBEDDING_ENDPOINT)
        else:
            # create embeddings using local embedding model
            if logflag:
@@ -78,24 +79,13 @@ def _initialize_embedder(self):

    def _initialize_vector_db(self) -> VDMS:
        """Initializes the vdms client."""
-         if VDMS_USE_CLIP:
-             dimensions = self.embedder.get_embedding_length()
-             vector_db = VDMS(
-                 client=self.client,
-                 embedding=self.embedder,
-                 collection_name=VDMS_INDEX_NAME,
-                 embedding_dimensions=dimensions,
-                 distance_strategy=DISTANCE_STRATEGY,
-                 engine=SEARCH_ENGINE,
-             )
-         else:
-             vector_db = VDMS(
-                 client=self.client,
-                 embedding=self.embedder,
-                 collection_name=VDMS_INDEX_NAME,
-                 distance_strategy=DISTANCE_STRATEGY,
-                 engine=SEARCH_ENGINE,
-             )
+         vector_db = VDMS(
+             client=self.client,
+             embedding=self.embedder,
+             collection_name=VDMS_INDEX_NAME,
+             distance_strategy=DISTANCE_STRATEGY,
+             engine=SEARCH_ENGINE,
+         )
        return vector_db

    def check_health(self) -> bool:
@@ -154,8 +144,127 @@ async def invoke(self, input: EmbedDoc) -> list:
                lambda_mult=input.lambda_mult,
                filter=input.constraints,
            )
+         else:
+             raise ValueError(f"{input.search_type} not valid")

        if logflag:
            logger.info(f"retrieve result: {search_res}")

        return search_res
+
+
+ class vCLIPEmbeddings(BaseModel, Embeddings):
+     """MeanCLIP Embeddings model."""
+
+     model: Any
+
+     def get_embedding_length(self):
+         text_features = self.embed_query("sample_text")
+         t_len = len(text_features)
+         logger.info(f"text_features: {t_len}")
+         return t_len
+
+     @model_validator(mode="before")
+     def validate_environment(cls, values: Dict) -> Dict:
+         """Validate that open_clip and torch libraries are installed."""
+         try:
+             # Use the provided model if present
+             if "model" not in values:
+                 raise ValueError("Model must be provided during initialization.")
+
+         except ImportError:
+             raise ImportError("Please ensure CLIP model is loaded")
+         return values
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         model_device = next(self.model.clip.parameters()).device
+         text_features = self.model.get_text_embeddings(texts)
+
+         return text_features.detach().numpy()
+
+     def embed_query(self, text: str) -> List[float]:
+         return self.embed_documents([text])[0]
+
+     def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]:
+         # Open images directly as PIL images
+
+         video_features = []
+         for vid_path in sorted(paths):
+             # Encode the video to get the embeddings
+             model_device = next(self.model.parameters()).device
+             # Preprocess the video for the model
+             clip_images = self.load_video_for_vclip(
+                 vid_path,
+                 num_frm=self.model.num_frm,
+                 max_img_size=224,
+                 start_time=kwargs.get("start_time", None),
+                 clip_duration=kwargs.get("clip_duration", None),
+             )
+             embeddings_tensor = self.model.get_video_embeddings([clip_images])
+
+             # Convert tensor to list and add to the video_features list
+             embeddings_list = embeddings_tensor.tolist()
+
+             video_features.append(embeddings_list)
+
+         return video_features
+
+     def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs):
+         # Load video with VideoReader
+         import decord
+
+         decord.bridge.set_bridge("torch")
+         vr = VideoReader(vid_path, ctx=cpu(0))
+         fps = vr.get_avg_fps()
+         num_frames = len(vr)
+         start_idx = int(fps * kwargs.get("start_time", [0])[0])
+         end_idx = start_idx + int(fps * kwargs.get("clip_duration", [num_frames])[0])
+
+         frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int)  # Uniform sampling
+         clip_images = []
+
+         # read images
+         temp_frms = vr.get_batch(frame_idx.astype(int).tolist())
+         for idx in range(temp_frms.shape[0]):
+             im = temp_frms[idx]  # H W C
+             clip_images.append(toPIL(im.permute(2, 0, 1)))
+
+         return clip_images
+
+
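A minimal sketch of the uniform frame sampling performed by `load_video_for_vclip` above, using hypothetical values (a 30 fps video, 4 frames drawn from an 8-second window starting at t = 4 s); the numbers are illustrative, not part of the change:

```python
import numpy as np

# Hypothetical inputs, not taken from the change above.
fps, num_frm = 30.0, 4
start_time, clip_duration = [4], [8]

start_idx = int(fps * start_time[0])               # 120
end_idx = start_idx + int(fps * clip_duration[0])  # 360
frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int)
print(frame_idx)  # [120 180 240 300] -> frames evenly spaced across the clip window
```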
+ class vCLIP(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+
+         self.num_frm = cfg["num_frm"]
+         self.model_name = cfg["model_name"]
+
+         self.clip = CLIPModel.from_pretrained(self.model_name)
+         self.processor = AutoProcessor.from_pretrained(self.model_name)
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+     def get_text_embeddings(self, texts):
+         """Input is list of texts."""
+         text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt")
+         text_features = self.clip.get_text_features(**text_inputs)
+         return text_features
+
+     def get_image_embeddings(self, images):
+         """Input is list of images."""
+         image_inputs = self.processor(images=images, return_tensors="pt")
+         image_features = self.clip.get_image_features(**image_inputs)
+         return image_features
+
+     def get_video_embeddings(self, frames_batch):
+         """Input is list of list of frames in video."""
+         self.batch_size = len(frames_batch)
+         vid_embs = []
+         for frames in frames_batch:
+             frame_embeddings = self.get_image_embeddings(frames)
+             frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=len(frames_batch))
+             # Normalize, mean aggregate and return normalized video_embeddings
+             frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True)
+             video_embeddings = frame_embeddings.mean(dim=1)
+             video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True)
+             vid_embs.append(video_embeddings)
+         return torch_cat(vid_embs, dim=0)
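A minimal usage sketch of the CLIP-based embedder introduced here; the video path and the `start_time`/`clip_duration` values are illustrative assumptions, not part of the change:

```python
# Assumed usage: wire up vCLIP + vCLIPEmbeddings, then embed a text query and a video clip.
meanclip_cfg = {"model_name": "openai/clip-vit-base-patch32", "num_frm": 64}
embedder = vCLIPEmbeddings(model=vCLIP(meanclip_cfg))

# Text query -> one vector in CLIP's shared text/image space.
query_emb = embedder.embed_query("a person riding a bike")
print(len(query_emb))  # e.g. 512 for clip-vit-base-patch32

# Video -> one embedding per path; start_time/clip_duration are single-element lists of
# seconds and must be supplied, since load_video_for_vclip indexes them with [0].
video_embs = embedder.embed_video(
    ["sample_video.mp4"],  # hypothetical local path
    start_time=[0],
    clip_duration=[10],
)
```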