Skip to content

Commit fdb2005

Browse files
cwlacewe and chensuyue authored
VDMS langchain package update (opea-project#1317)
* Update VDMS related components
* Pinned protobuf==4.24.2 for vdms to avoid issues
* Move opentelemetry installation to requirements file like others, unset proxy in opensearch tests. Opensearch tests pass locally with setting vm.max_map_count
* pin protobuf and downgrade opentelemetry to v1.27.0 in dataprep and retriever dockerfiles
* Move opentelemetry installation to requirements file like others, unset proxy in opensearch tests. Opensearch tests pass locally with setting vm.max_map_count

Signed-off-by: Lacewell, Chaunte W <chaunte.w.lacewell@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
1 parent e349c50 commit fdb2005

File tree

14 files changed

+278
-93
lines changed

14 files changed

+278
-93
lines changed

.github/workflows/pr-microservice-test.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ jobs:
6464
role-to-assume: ${{ secrets.AWS_IAM_ROLE_ARN }}
6565
aws-region: us-east-1
6666

67+
- name: Set Memory Map Limit
68+
if: ${{ contains(matrix.service, "opensearch") }}
69+
run: sudo sysctl -w vm.max_map_count=262144
70+
6771
- name: Run microservice test
6872
env:
6973
HF_TOKEN: ${{ secrets.HF_TOKEN }}

comps/dataprep/src/Dockerfile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \
3939
PIP_EXTRA_INDEX_URL=""; \
4040
fi && \
4141
pip install --no-cache-dir torch torchvision ${PIP_EXTRA_INDEX_URL} && \
42-
pip install --no-cache-dir ${PIP_EXTRA_INDEX_URL} -r /home/user/comps/dataprep/src/requirements.txt && \
43-
pip install opentelemetry-api==1.29.0 opentelemetry-exporter-otlp==1.29.0 opentelemetry-sdk==1.29.0
42+
pip install --no-cache-dir ${PIP_EXTRA_INDEX_URL} -r /home/user/comps/dataprep/src/requirements.txt
4443

4544
ENV PYTHONPATH=$PYTHONPATH:/home/user
4645

comps/dataprep/src/integrations/utils/store_embeddings.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import numpy as np
77
import torchvision.transforms as T
88
from decord import VideoReader, cpu
9-
from langchain.pydantic_v1 import BaseModel, root_validator
10-
from langchain_community.vectorstores import VDMS
11-
from langchain_community.vectorstores.vdms import VDMS_Client
129
from langchain_core.embeddings import Embeddings
10+
from langchain_vdms.vectorstores import VDMS, VDMS_Client
11+
from pydantic import BaseModel, model_validator
1312

1413
toPIL = T.ToPILImage()
1514

@@ -21,7 +20,7 @@ class vCLIPEmbeddings(BaseModel, Embeddings):
2120

2221
model: Any
2322

24-
@root_validator(allow_reuse=True)
23+
@model_validator(mode="before")
2524
def validate_environment(cls, values: Dict) -> Dict:
2625
"""Validate that open_clip and torch libraries are installed."""
2726
try:
@@ -99,6 +98,8 @@ def __init__(
9998
collection_name,
10099
embedding_dimensions: int = 512,
101100
chosen_video_search_type="similarity",
101+
engine: str = "FaissFlat",
102+
distance_strategy: str = "IP",
102103
):
103104

104105
self.host = host
@@ -110,6 +111,8 @@ def __init__(
110111
self.video_embedder = vCLIPEmbeddings(model=video_retriever_model)
111112
self.chosen_video_search_type = chosen_video_search_type
112113
self.embedding_dimensions = embedding_dimensions
114+
self.engine = engine
115+
self.distance_strategy = distance_strategy
113116

114117
# initialize_db
115118
self.get_db_client()
@@ -128,7 +131,7 @@ def init_db(self):
128131
client=self.client,
129132
embedding=self.video_embedder,
130133
collection_name=self.video_collection,
131-
engine="FaissFlat",
132-
distance_strategy="IP",
134+
engine=self.engine,
135+
distance_strategy=self.distance_strategy,
133136
embedding_dimensions=self.embedding_dimensions,
134137
)

comps/dataprep/src/integrations/vdms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from fastapi import Body, File, Form, HTTPException, UploadFile
99
from langchain.text_splitter import RecursiveCharacterTextSplitter
1010
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
11-
from langchain_community.vectorstores.vdms import VDMS, VDMS_Client
1211
from langchain_huggingface import HuggingFaceEmbeddings
1312
from langchain_text_splitters import HTMLHeaderTextSplitter
13+
from langchain_vdms.vectorstores import VDMS, VDMS_Client
1414

1515
from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType
1616
from comps.dataprep.src.utils import (

comps/dataprep/src/integrations/vdms_multimodal.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0")
2424
VECTORDB_SERVICE_PORT = os.getenv("VDMS_PORT", 55555)
2525
collection_name = os.getenv("INDEX_NAME", "rag-vdms")
26+
SEARCH_ENGINE = os.getenv("SEARCH_ENGINE", "FaissFlat")
27+
DISTANCE_STRATEGY = os.getenv("DISTANCE_STRATEGY", "IP")
2628

2729
logger = CustomLogger("opea_dataprep_vdms_multimodal")
2830
logflag = os.getenv("LOGFLAG", False)
@@ -72,6 +74,7 @@ def store_into_vectordb(self, vs, metadata_file_path, dimensions):
7274
metadata_list = [data]
7375
if vs.selected_db == "vdms":
7476
vs.video_db.add_videos(
77+
texts=video_name_list,
7578
paths=video_name_list,
7679
metadatas=metadata_list,
7780
start_time=[data["timestamp"]],
@@ -145,14 +148,21 @@ async def ingest_videos(self, files: List[UploadFile] = File(None)):
145148
# init meanclip model
146149
model = self.setup_vclip_model(meanclip_cfg, device="cpu")
147150
vs = store_embeddings.VideoVS(
148-
host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions
151+
host,
152+
port,
153+
selected_db,
154+
model,
155+
collection_name,
156+
embedding_dimensions=vector_dimensions,
157+
engine=SEARCH_ENGINE,
158+
distance_strategy=DISTANCE_STRATEGY,
149159
)
150160
logger.info("done creating DB, sleep 5s")
151161
await asyncio.sleep(5)
152162

153163
self.generate_embeddings(config, vector_dimensions, vs)
154164

155-
return {"message": "Videos ingested successfully"}
165+
return {"status": 200, "message": "Videos ingested successfully"}
156166

157167
async def get_videos(self):
158168
"""Returns list of names of uploaded videos saved on the server."""

comps/dataprep/src/requirements.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ einops
99
elasticsearch
1010
fastapi
1111
future
12-
graspologic
12+
graspologic
1313
html2text
1414
huggingface_hub
1515
ipython
@@ -21,9 +21,10 @@ langchain-openai
2121
langchain-pinecone
2222
langchain-redis
2323
langchain-text-splitters
24+
langchain-vdms>=0.1.4
2425
langchain_huggingface
2526
langchain_milvus
26-
llama-index
27+
llama-index
2728
llama-index-core==0.12.19
2829
llama-index-embeddings-text-embeddings-inference
2930
llama-index-graph-stores-neo4j
@@ -37,11 +38,15 @@ openai
3738
openai-whisper
3839
opencv-python
3940
opensearch-py
41+
opentelemetry-api==1.27.0
42+
opentelemetry-exporter-otlp==1.27.0
43+
opentelemetry-sdk==1.27.0
4044
pandas
4145
pgvector==0.2.5
4246
Pillow
4347
pinecone-client
4448
prometheus-fastapi-instrumentator
49+
protobuf==4.24.2
4550
psycopg2
4651
pymupdf
4752
pyspark
@@ -60,5 +65,4 @@ typing
6065
tzlocal
6166
unstructured[all-docs]
6267
uvicorn
63-
vdms
6468
webvtt-py

comps/retrievers/deployment/docker_compose/compose.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,18 @@ services:
196196
tei-embedding-serving:
197197
condition: service_healthy
198198

199+
retriever-vdms-multimodal:
200+
extends: retriever
201+
container_name: retriever-vdms-multimodal
202+
environment:
203+
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_VDMS"
204+
VDMS_INDEX_NAME: ${INDEX_NAME}
205+
VDMS_HOST: ${host_ip}
206+
VDMS_PORT: ${VDMS_PORT}
207+
VDMS_USE_CLIP: ${VDMS_USE_CLIP}
208+
depends_on:
209+
vdms-vector-db:
210+
condition: service_healthy
199211

200212
networks:
201213
default:

comps/retrievers/src/Dockerfile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \
2525
PIP_EXTRA_INDEX_URL=""; \
2626
fi && \
2727
pip install --no-cache-dir torch torchvision ${PIP_EXTRA_INDEX_URL} && \
28-
pip install --no-cache-dir ${PIP_EXTRA_INDEX_URL} -r /home/user/comps/retrievers/src/requirements.txt && \
29-
pip install opentelemetry-api==1.29.0 opentelemetry-exporter-otlp==1.29.0 opentelemetry-sdk==1.29.0
28+
pip install --no-cache-dir ${PIP_EXTRA_INDEX_URL} -r /home/user/comps/retrievers/src/requirements.txt
3029

3130
ENV PYTHONPATH=$PYTHONPATH:/home/user
3231

comps/retrievers/src/integrations/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,5 +184,5 @@ def format_opensearch_conn_from_env():
184184
VDMS_PORT = int(os.getenv("VDMS_PORT", 55555))
185185
VDMS_INDEX_NAME = os.getenv("VDMS_INDEX_NAME", "rag_vdms")
186186
VDMS_USE_CLIP = int(os.getenv("VDMS_USE_CLIP", 0))
187-
SEARCH_ENGINE = "FaissFlat"
188-
DISTANCE_STRATEGY = "IP"
187+
SEARCH_ENGINE = os.getenv("SEARCH_ENGINE", "FaissFlat")
188+
DISTANCE_STRATEGY = os.getenv("DISTANCE_STRATEGY", "IP")

0 commit comments

Comments
 (0)