Commit 4d9bd51
Fix/file upload explore (#412)
1 parent ed61880

10 files changed (+130, -79 lines)


backend/models/brains.py

Lines changed: 18 additions & 10 deletions

@@ -40,7 +40,6 @@ def brain_size(self):
     def remaining_brain_size(self):
         return float(self.max_brain_size) - self.brain_size

-
     @classmethod
     def create(cls, *args, **kwargs):
         commons = common_dependencies()
@@ -79,18 +78,17 @@ def create_brain(self):
         self.id = response.data[0]['brain_id']
         return response.data

-    def create_brain_user(self, user_id : UUID, rights, default_brain):
+    def create_brain_user(self, user_id: UUID, rights, default_brain):
         commons = common_dependencies()
-        response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id":str( user_id), "rights": rights, "default_brain": default_brain}).execute()
+        response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id": str(user_id), "rights": rights, "default_brain": default_brain}).execute()

-
         return response.data

-    def create_brain_vector(self, vector_id):
+    def create_brain_vector(self, vector_id, file_sha1):
         response = (
             self.commons["supabase"]
             .table("brains_vectors")
-            .insert({"brain_id": str(self.id), "vector_id": str(vector_id)})
+            .insert({"brain_id": str(self.id), "vector_id": str(vector_id), "file_sha1": file_sha1})
             .execute()
         )
         return response.data
@@ -115,7 +113,7 @@ def update_brain_with_file(self, file_sha1: str):
         # not used
         vector_ids = self.get_vector_ids_from_file_sha1(file_sha1)
         for vector_id in vector_ids:
-            self.create_brain_vector(vector_id)
+            self.create_brain_vector(vector_id, file_sha1)

     def get_unique_brain_files(self):
         """
@@ -142,15 +140,24 @@ def get_unique_brain_files(self):

         return self.files

-    def get_unique_files_from_vector_ids(self, vectors_ids : List[int]):
+    def get_unique_files_from_vector_ids(self, vectors_ids: List[int]):
         # Move into Vectors class
         """
         Retrieve unique user data vectors.
         """
-        vectors_response = self.commons['supabase'].table("vectors").select(
-            "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+        print('vectors_ids', vectors_ids)
+        print('tuple(vectors_ids)', tuple(vectors_ids))
+        if len(vectors_ids) == 1:
+            vectors_response = self.commons['supabase'].table("vectors").select(
+                "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+                .filter("id", "eq", vectors_ids[0])\
+                .execute()
+        else:
+            vectors_response = self.commons['supabase'].table("vectors").select(
+                "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
             .filter("id", "in", tuple(vectors_ids))\
             .execute()
+
         documents = vectors_response.data  # Access the data from the response
         # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
         unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]
@@ -187,6 +194,7 @@ def get_default_user_brain(user: User):
             .execute()
     )

+    print("Default brain response:", response.data)
    default_brain_id = response.data[0]["brain_id"] if response.data else None

    print(f"Default brain id: {default_brain_id}")

backend/models/files.py

Lines changed: 41 additions & 22 deletions

@@ -6,24 +6,26 @@
 from fastapi import UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from logger import get_logger
+from models.brains import Brain
 from models.settings import CommonsDep, common_dependencies
 from pydantic import BaseModel
 from utils.file import compute_sha1_from_file

 logger = get_logger(__name__)

+
 class File(BaseModel):
     id: Optional[UUID] = None
     file: Optional[UploadFile]
     file_name: Optional[str] = ""
     file_size: Optional[int] = ""
     file_sha1: Optional[str] = ""
-    vectors_ids: Optional[int]=[]
+    vectors_ids: Optional[int] = []
     file_extension: Optional[str] = ""
-    content: Optional[Any]= None
+    content: Optional[Any] = None
     chunk_size: int = 500
-    chunk_overlap: int= 0
-    documents: Optional[Any]= None
+    chunk_overlap: int = 0
+    documents: Optional[Any] = None
     _commons: Optional[CommonsDep] = None

     def __init__(self, **kwargs):
@@ -56,7 +58,6 @@ def compute_documents(self, loader_class):

         print("documents", documents)

-
         os.remove(tmp_file.name)

         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
@@ -68,6 +69,11 @@ def compute_documents(self, loader_class):
         print(self.documents)

     def set_file_vectors_ids(self):
+        """
+        Set the vectors_ids property with the ids of the vectors
+        that are associated with the file in the vectors table
+        """
+
         commons = common_dependencies()
         response = (
             commons["supabase"].table("vectors")
@@ -78,32 +84,45 @@ def set_file_vectors_ids(self):
         self.vectors_ids = response.data
         return

-    def file_already_exists(self, brain_id):
-        commons = common_dependencies()
-
+    def file_already_exists(self):
+        """
+        Check if file already exists in vectors table
+        """
         self.set_file_vectors_ids()

         print("file_sha1", self.file_sha1)
         print("vectors_ids", self.vectors_ids)
         print("len(vectors_ids)", len(self.vectors_ids))

+        # if the file does not exist in vectors then no need to go check in brains_vectors
         if len(self.vectors_ids) == 0:
             return False
-
-        for vector in self.vectors_ids:
-            response = (
-                commons["supabase"].table("brains_vectors")
-                .select("brain_id, vector_id")
-                .filter("brain_id", "eq", brain_id)
-                .filter("vector_id", "eq", vector['id'])
-                .execute()
-            )
-            print("response.data", response.data)
-            if len(response.data) == 0:
-                return False
-
+
         return True

+    def file_already_exists_in_brain(self, brain_id):
+        commons = common_dependencies()
+        self.set_file_vectors_ids()
+        # Check if file exists in that brain
+        response = (
+            commons["supabase"].table("brains_vectors")
+            .select("brain_id, vector_id")
+            .filter("brain_id", "eq", brain_id)
+            .filter("file_sha1", "eq", self.file_sha1)
+            .execute()
+        )
+        print("response.data", response.data)
+        if len(response.data) == 0:
+            return False
+
+        return True
+
     def file_is_empty(self):
         return self.file.file._file.tell() < 1
-
+
+    def link_file_to_brain(self, brain: Brain):
+        self.set_file_vectors_ids()
+
+        for vector_id in self.vectors_ids:
+            brain.create_brain_vector(vector_id['id'], self.file_sha1)
+        print(f"Successfully linked file {self.file_sha1} to brain {brain.id}")

backend/models/users.py

Lines changed: 5 additions & 7 deletions

@@ -5,24 +5,23 @@
 from pydantic import BaseModel

 logger = get_logger(__name__)
+
+
 class User(BaseModel):
     id: UUID
     email: str
     user_openai_api_key: str = None
     requests_count: int = 0
-    user_openai_api_key: str = None
-

     # [TODO] Rename the user table and its references to 'user_usage'
-    def create_user( self,date):
+    def create_user(self, date):

         commons = common_dependencies()
         logger.info(f"New user entry in db document for user {self.email}")

-        return(commons['supabase'].table("users").insert(
+        return (commons['supabase'].table("users").insert(
             {"user_id": self.id, "email": self.email, "date": date, "requests_count": 1}).execute())

-
     def get_user_request_stats(self):
         commons = common_dependencies()
         requests_stats = commons['supabase'].from_('users').select(
@@ -43,12 +42,11 @@ def fetch_user_requests_count(self, date):

         return userItem["requests_count"]

-
     def increment_user_request_count(self, date):
         commons = common_dependencies()
         requests_count = self.fetch_user_requests_count(date) + 1
         logger.info(f"User {self.email} request count updated to {requests_count}")
         commons['supabase'].table("users").update(
-            { "requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
+            {"requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
         self.requests_count = requests_count

backend/parsers/common.py

Lines changed: 2 additions & 1 deletion

@@ -31,14 +31,15 @@ async def process_file(
     }
     doc_with_metadata = Document(
         page_content=doc.page_content, metadata=metadata)
+
     neurons = Neurons(commons=commons)
     created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
     # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})

     created_vector_id = created_vector[0]

     brain = Brain(id=brain_id)
-    brain.create_brain_vector(created_vector_id)
+    brain.create_brain_vector(created_vector_id, file.file_sha1)

     return
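With create_brain_vector now taking the sha1 as well, every association row in brains_vectors records which file a vector came from. Roughly, the inserted row grows from two columns to three (a sketch; the column names are taken from the diff above):

row = {
    "brain_id": str(brain.id),            # as before
    "vector_id": str(created_vector_id),  # as before
    "file_sha1": file.file_sha1,          # new: lets lookups match by file rather than per vector
}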

backend/parsers/github.py

Lines changed: 13 additions & 13 deletions

@@ -15,8 +15,8 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
     random_dir_name = os.urandom(16).hex()
     dateshort = time.strftime("%Y%m%d")
     loader = GitLoader(
-       clone_url=repo,
-       repo_path="/tmp/" + random_dir_name,
+        clone_url=repo,
+        repo_path="/tmp/" + random_dir_name,
     )
     documents = loader.load()
     os.system("rm -rf /tmp/" + random_dir_name)
@@ -44,21 +44,21 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)

-        file = File(file_sha1 = compute_sha1_from_content(doc.page_content.encode("utf-8")))
+        file = File(file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8")))

-        exist = file.file_already_exists(brain_id)
-        if not exist:
+        file_exists = file.file_already_exists()
+
+        if not file_exists:
+            print(f"Creating entry for file {file.file_sha1} in vectors...")
             neurons = Neurons(commons=commons)
             created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
+            print("Created vector sids ", created_vector)
+            print("Created vector for ", doc.metadata["file_name"])

-            created_vector_id = created_vector[0]
+        file_exists_in_brain = file.file_already_exists_in_brain(brain_id)

+        if not file_exists_in_brain:
+            file.add_file_to_brain(brain_id)
             brain = Brain(id=brain_id)
-            brain.create_brain_vector(created_vector_id)
-
-            print("Created vector for ", doc.metadata["file_name"])
-            # add created_vector x brains in db
-
-
+            file.link_file_to_brain(brain)
     return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}
-

backend/routes/brain_routes.py

Lines changed: 4 additions & 1 deletion

@@ -131,10 +131,13 @@ async def brain_endpoint(
         brain.create_brain()
         default_brain = get_default_user_brain(current_user)
         if default_brain:
-            # create a brain X user entry
+            logger.info(f"Default brain already exists for user {current_user.id}")
             brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=False)
         else:
+            logger.info(f"Default brain does not exist for user {current_user.id}. It will be created.")
             brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=True)
+
+
         return {"id": brain.id, "name": brain.name}

     # update existing brain

backend/routes/explore_routes.py

Lines changed: 2 additions & 2 deletions

@@ -9,8 +9,8 @@
 explore_router = APIRouter()


-@explore_router.get("/explore", dependencies=[Depends(AuthBearer())], tags=["Explore"])
-async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"),current_user: User = Depends(get_current_user)):
+@explore_router.get("/explore/", dependencies=[Depends(AuthBearer())], tags=["Explore"])
+async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"), current_user: User = Depends(get_current_user)):
     """
     Retrieve and explore unique user data vectors.
     """

backend/utils/processors.py

Lines changed: 22 additions & 9 deletions

@@ -1,4 +1,5 @@

+from models.brains import Brain
 from models.files import File
 from models.settings import CommonsDep
 from parsers.audio import process_audio
@@ -35,20 +36,32 @@
 }


+def create_response(message, type):
+    return {"message": message, "type": type}


 async def filter_file(commons: CommonsDep, file: File, enable_summarization: bool, brain_id, openai_api_key):
     await file.compute_file_sha1()

     print("file sha1", file.file_sha1)
-    if file.file_already_exists( brain_id):
-        return {"message": f"🤔 {file.file.filename} already exists in brain {brain_id}.", "type": "warning"}
+    file_exists = file.file_already_exists()
+    file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
+
+    if file_exists_in_brain:
+        return create_response(f"🤔 {file.file.filename} already exists in brain {brain_id}.", "warning")
     elif file.file_is_empty():
-        return {"message": f"❌ {file.file.filename} is empty.", "type": "error"}
-    else:
-        if file.file_extension in file_processors:
-            await file_processors[file.file_extension](commons,file, enable_summarization, brain_id ,openai_api_key )
-            return {"message": f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "type": "success"}
-        else:
-            return {"message": f"❌ {file.file.filename} is not supported.", "type": "error"}
+        return create_response(f"❌ {file.file.filename} is empty.", "error")
+    elif file_exists:
+        file.link_file_to_brain(brain=Brain(id=brain_id))
+        return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")
+
+    if file.file_extension in file_processors:
+        try:
+            await file_processors[file.file_extension](commons, file, enable_summarization, brain_id, openai_api_key)
+            return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")
+        except Exception as e:
+            # Add more specific exceptions as needed.
+            print(f"Error processing file: {e}")
+            return create_response(f"⚠️ An error occurred while processing {file.file.filename}.", "error")

+    return create_response(f"❌ {file.file.filename} is not supported.", "error")
