
Commit 963fb05

feat(github): now github loader (#264)
1 parent 000933f commit 963fb05

4 files changed: +106 −14 lines changed

backend/crawl/crawler.py

+13 −5

@@ -1,10 +1,11 @@
-import requests
-from pydantic import BaseModel
-import requests
+import os
 import re
-import unicodedata
 import tempfile
-import os
+import unicodedata
+
+import requests
+from langchain.document_loaders import GitLoader
+from pydantic import BaseModel
 
 
 class CrawlWebsite(BaseModel):

@@ -23,6 +24,7 @@ def _crawl(self, url):
 
     def process(self):
         content = self._crawl(self.url)
+
         ## Create a file
         file_name = slugify(self.url) + ".html"
         temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

@@ -34,6 +36,12 @@ def process(self):
             return temp_file_path, file_name
         else:
             return None
+    def checkGithub(self):
+        if "github.com" in self.url:
+            return True
+        else:
+            return False
+
 
 
 def slugify(text):
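For reference, checkGithub is just a substring test on the crawl URL that decides whether a request is routed to the new GitLoader path or to the existing HTML crawler. A minimal standalone sketch of that routing check (only the url field of the real CrawlWebsite model is shown, and the if/else from the diff is compressed into a single return):

from pydantic import BaseModel

class CrawlWebsite(BaseModel):
    # The real model carries more crawl settings; url is all checkGithub needs.
    url: str

    def checkGithub(self):
        # GitHub repository URLs are handed to process_github instead of the HTML crawler.
        return "github.com" in self.url

assert CrawlWebsite(url="https://github.com/StanGirard/quivr").checkGithub() is True
assert CrawlWebsite(url="https://example.com/blog/post").checkGithub() is False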

backend/main.py

+34 −9

@@ -13,6 +13,7 @@
 from middlewares.cors import add_cors_middleware
 from models.chats import ChatMessage
 from models.users import User
+from parsers.github import process_github
 from utils.file import convert_bytes, get_file_size
 from utils.processors import filter_file
 from utils.vectors import (CommonsDep, create_user, similarity_search,

@@ -114,18 +115,42 @@ async def chat_endpoint(commons: CommonsDep, chat_message: ChatMessage, credenti
 
 @app.post("/crawl/", dependencies=[Depends(JWTBearer())])
 async def crawl_endpoint(commons: CommonsDep, crawl_website: CrawlWebsite, enable_summarization: bool = False, credentials: dict = Depends(JWTBearer())):
+    max_brain_size = os.getenv("MAX_BRAIN_SIZE")
+
     user = User(email=credentials.get('email', 'none'))
-    file_path, file_name = crawl_website.process()
+    user_vectors_response = commons['supabase'].table("vectors").select(
+        "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+        .filter("user_id", "eq", user.email)\
+        .execute()
+    documents = user_vectors_response.data  # Access the data from the response
+    # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
+    user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]
 
-    # Create a SpooledTemporaryFile from the file_path
-    spooled_file = SpooledTemporaryFile()
-    with open(file_path, 'rb') as f:
-        shutil.copyfileobj(f, spooled_file)
+    current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)
 
-    # Pass the SpooledTemporaryFile to UploadFile
-    file = UploadFile(file=spooled_file, filename=file_name)
-    message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
-    return message
+    file_size = 1000000
+
+    remaining_free_space = float(max_brain_size) - (current_brain_size)
+
+    if remaining_free_space - file_size < 0:
+        message = {"message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is : {convert_bytes(remaining_free_space)}", "type": "error"}
+    else:
+        user = User(email=credentials.get('email', 'none'))
+        if not crawl_website.checkGithub():
+
+            file_path, file_name = crawl_website.process()
+
+            # Create a SpooledTemporaryFile from the file_path
+            spooled_file = SpooledTemporaryFile()
+            with open(file_path, 'rb') as f:
+                shutil.copyfileobj(f, spooled_file)
+
+            # Pass the SpooledTemporaryFile to UploadFile
+            file = UploadFile(file=spooled_file, filename=file_name)
+            message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
+            return message
+        else:
+            message = await process_github(crawl_website.url, "false", user=user, supabase=commons['supabase'])
 
 
 @app.get("/explore", dependencies=[Depends(JWTBearer())])
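The reworked /crawl/ endpoint now checks the user's remaining "brain" capacity before doing anything else (MAX_BRAIN_SIZE minus the summed file_size metadata of the user's unique vectors, with this crawl assumed to cost the hard-coded 1,000,000 bytes), then branches on checkGithub. A sketch of exercising the new GitHub branch from a client, assuming url is the only required field of CrawlWebsite and that the backend address, port, and JWT below are placeholders:

import requests

BASE_URL = "http://localhost:5050"          # placeholder; use your backend's address
TOKEN = "<supabase-jwt>"                    # JWTBearer() expects a valid JWT

# A github.com URL is routed to process_github; any other URL still goes
# through crawl_website.process() and filter_file() as before.
payload = {"url": "https://github.com/StanGirard/quivr"}

resp = requests.post(
    f"{BASE_URL}/crawl/",
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
print(resp.status_code, resp.json())
# Expected shape on success: {"message": "✅ Github with N files has been uploaded.", "type": "success"}

Note that, as the diff stands, the GitHub branch assigns message from process_github but never returns it, so a client may receive a null body even when the upload succeeds.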

backend/parsers/common.py

+6 −0

@@ -65,3 +65,9 @@ async def file_already_exists(supabase, file, user):
     response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
         .filter("user_id", "eq", user.email).execute()
     return len(response.data) > 0
+
+async def file_already_exists_from_content(supabase, file_content, user):
+    file_sha1 = compute_sha1_from_content(file_content)
+    response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
+        .filter("user_id", "eq", user.email).execute()
+    return len(response.data) > 0
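file_already_exists_from_content mirrors file_already_exists but keys the lookup on the SHA-1 of a chunk's raw bytes rather than of a file on disk, which is what lets process_github deduplicate per document chunk. Assuming utils.file.compute_sha1_from_content is essentially a hashlib wrapper, the stored key can be reproduced like this (the helper below is a stand-in for illustration, not the project's own code):

import hashlib

def compute_sha1_from_content(content: bytes) -> str:
    # Presumed equivalent of utils.file.compute_sha1_from_content: hex SHA-1 of the raw bytes.
    return hashlib.sha1(content).hexdigest()

chunk_text = "def slugify(text):\n    ..."
file_sha1 = compute_sha1_from_content(chunk_text.encode("utf-8"))
print(file_sha1)  # compared against metadata->>file_sha1 in the vectors table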

backend/parsers/github.py

+53 −0

@@ -0,0 +1,53 @@
+import os
+import time
+
+from fastapi import UploadFile
+from langchain.document_loaders import GitLoader
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from parsers.common import file_already_exists_from_content
+from utils.file import compute_sha1_from_content, compute_sha1_from_file
+from utils.vectors import create_summary, create_vector, documents_vector_store
+
+from .common import process_file
+
+
+async def process_github(repo, enable_summarization, user, supabase):
+    random_dir_name = os.urandom(16).hex()
+    dateshort = time.strftime("%Y%m%d")
+    loader = GitLoader(
+        clone_url=repo,
+        repo_path="/tmp/" + random_dir_name,
+    )
+    documents = loader.load()
+    os.system("rm -rf /tmp/" + random_dir_name)
+
+    chunk_size = 500
+    chunk_overlap = 0
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+    documents = text_splitter.split_documents(documents)
+    print(documents[:1])
+
+    for doc in documents:
+        if doc.metadata["file_type"] in [".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"]:
+            continue
+        metadata = {
+            "file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
+            "file_size": len(doc.page_content)*8,
+            "file_name": doc.metadata["file_name"],
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "date": dateshort,
+            "summarization": "true" if enable_summarization else "false"
+        }
+        doc_with_metadata = Document(
+            page_content=doc.page_content, metadata=metadata)
+        exist = await file_already_exists_from_content(supabase, doc.page_content.encode("utf-8"), user)
+        if not exist:
+            create_vector(user.email, doc_with_metadata)
+            print("Created vector for ", doc.metadata["file_name"])
+
+    return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}
+
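The clone, load, and split stage of process_github can be tried on its own, without FastAPI or Supabase. A minimal sketch using the same LangChain APIs the file imports (the repository URL is only an example, and GitLoader checks out the default branch, "main", unless told otherwise):

import tempfile

from langchain.document_loaders import GitLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Clone into a throwaway directory rather than a hand-built /tmp path.
repo_path = tempfile.mkdtemp(prefix="gitloader-")
loader = GitLoader(
    clone_url="https://github.com/StanGirard/quivr",  # example repository
    repo_path=repo_path,
)
documents = loader.load()           # one Document per tracked file

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=0)
chunks = splitter.split_documents(documents)

print(len(documents), "files ->", len(chunks), "chunks")
print(chunks[0].metadata)           # file_name, file_path, file_type drive the filtering above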
