-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpopulate_milvus_db.py
63 lines (47 loc) · 1.59 KB
/
populate_milvus_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from pymilvus import connections
from dotenv import load_dotenv
from pymilvus import utility
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection
from connect_milvus import connect_to_milvus
import json
# Open the connection to Zilliz via the project's connect_milvus helper.
connect_to_milvus()

# Names of the collections that currently exist on the server.
# NOTE: a guard that raised Exception("Collection already exists") when
# 'movies_collection' was in this list is currently disabled, so the result
# is not consulted anywhere below.
available_collections = utility.list_collections()

# Column definitions for the movie collection.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True,  # primary keys are generated on insert, never supplied
)
title_field = FieldSchema(
    name="title",
    dtype=DataType.VARCHAR,
    max_length=200,
)
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=384,  # Adjust dim to match the length of your embedding vectors
)
fields = [id_field, title_field, embedding_field]

# Schema: the ordered fields plus a human-readable description.
schema = CollectionSchema(fields=fields, description="Movies with embeddings")

# Handle to the "movies_collection" collection described by the schema above.
collection = Collection(name="movies_collection", schema=schema)
# Load the movie records together with their precomputed embeddings.
# The encoding is given explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's locale encoding.
with open("movies_with_embeddings.json", "r", encoding="utf-8") as f:
    movies = json.load(f)

# Prepare column-wise data for insertion: titles[i] pairs with embeddings[i].
titles = []
embeddings = []
# Titles already accepted. A set keeps the duplicate check O(1) per record
# instead of rescanning the whole titles list for every movie.
seen_titles = set()
for movie in movies:
    embedding = movie.get("embedding")
    if not embedding:
        # Skip records with a missing or empty embedding.
        continue
    title = movie.get("title")
    if title in seen_titles:
        # Eliminate duplicates: only the first record for each title is kept.
        continue
    seen_titles.add(title)
    titles.append(title)
    embeddings.append(embedding)

# Sanity check: both columns must end up with the same length.
print(len(titles), len(embeddings))
# Insert the rows into Zilliz as column lists, in schema order. The "id"
# primary key is declared with auto_id=True, so only the title and embedding
# columns are supplied.
entities = [titles, embeddings]
collection.insert(data=entities)

# Index the vector column for faster search: IVF_FLAT with the L2 metric and
# nlist=128.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)

# Load the collection once the index has been created.
collection.load()