# populate_milvus_db.py

from pymilvus import connections
from dotenv import load_dotenv
from pymilvus import utility
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection
from connect_milvus import connect_to_milvus
import json

# Open the connection to the Milvus/Zilliz instance through the project helper.
connect_to_milvus()

# Names of the collections currently on the server. Nothing below consumes
# this value; it exists for an optional guard that would abort when
# 'movies_collection' is already present (that guard is currently disabled).
available_collections = utility.list_collections()

# Schema: an auto-generated INT64 primary key, the movie title, and the
# movie's embedding vector.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True,
)
title_field = FieldSchema(
    name="title",
    dtype=DataType.VARCHAR,
    max_length=200,
)
# dim has to match the length of the embedding vectors being stored.
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=384,
)

fields = [id_field, title_field, embedding_field]
schema = CollectionSchema(fields, "Movies with embeddings")

# Handle to the collection the movies are written to.
collection = Collection("movies_collection", schema)


# Load the movie records; each record is read through its "title" and
# "embedding" keys. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's default encoding.
with open("movies_with_embeddings.json", "r", encoding="utf-8") as f:
    movies = json.load(f)

# Build the two parallel columns to insert: titles[i] belongs to embeddings[i].
# Records with a missing/empty embedding are skipped, and only the first
# record seen for each title is kept.
titles = []
embeddings = []
seen_titles = set()  # O(1) duplicate check instead of rescanning `titles`
for movie in movies:
    embedding = movie.get("embedding")
    if not embedding:
        continue
    title = movie.get("title")
    # Eliminate duplicates
    if title in seen_titles:
        continue
    seen_titles.add(title)
    titles.append(title)
    embeddings.append(embedding)

# Sanity check: both columns must have the same length.
print(len(titles), len(embeddings))
# Insert data into Zilliz.
# Column-oriented payload: one list per non-auto field, in schema order
# (title, embedding); the primary key is generated by the server.
entities = [titles, embeddings]

# Send the rows in fixed-size batches rather than one request: a single
# insert carrying every embedding can exceed the RPC message size limit on
# large datasets. With no rows at all, the loop simply performs no insert.
INSERT_BATCH_SIZE = 1000
for start in range(0, len(titles), INSERT_BATCH_SIZE):
    end = start + INSERT_BATCH_SIZE
    collection.insert([column[start:end] for column in entities])

# Seal what was just inserted before building the index, as the pymilvus
# example does after its final insert.
collection.flush()

# Index the vector field for faster search: IVF_FLAT with nlist=128,
# compared using the L2 metric.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
collection.create_index(field_name="embedding", index_params=index_params)

# Load the collection so it is ready to serve searches.
collection.load()