importfromconf_txt2vec.py

import os

import requests
import tiktoken
import weaviate
from dotenv import load_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter

# Load CONFURL, CONFKEY, WEAVIATE_URL and WEAVIATE_KEY from a local .env file.
load_dotenv()

def get_spaces():
    """
    Get all global spaces from Confluence.
    """
    url = f"{os.getenv('CONFURL')}/rest/api/space"
    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {os.getenv('CONFKEY')}"
    }
    params = {
        "limit": 250,
        "type": "global",
        "keys": "name,key"
    }
    response = requests.request(
        "GET",
        url,
        headers=headers,
        params=params
    )
    spaces = response.json()['results']
    listoutput = [{"name": space['name'], "key": space['key']} for space in spaces]
    return listoutput
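
# Illustrative example (not part of the import pipeline): get_spaces() is not called
# below, but it can be used to discover space keys for the CQL filter, e.g.:
#   for space in get_spaces():
#       print(f"{space['key']}: {space['name']}")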

def confluence_load():
    """
    Load pages from the Confluence space.
    """
    loader = ConfluenceLoader(
        url=os.getenv('CONFURL'),  # type: ignore
        token=os.getenv('CONFKEY')
    )
    # CQL: pages modified within the last two years, excluding the IT archive
    documents = loader.load(
        keep_markdown_format=True,
        cql='type = page AND lastmodified >= now("-2y") AND lastmodified <= now() AND space.key = "ITPortal"')
    return documents

WEAVIATE_URL = os.getenv('WEAVIATE_URL')
os.environ["WEAVIATE_API_KEY"] = os.getenv('WEAVIATE_KEY')
client = weaviate.Client(
    WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(os.getenv('WEAVIATE_KEY')),  # type: ignore
)
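
# Illustrative example (assumes the weaviate-client v3 API used above and that the
# "interactiveit_kb" class has been created separately): the target class schema
# can be inspected before importing, e.g.:
#   print(client.schema.get("interactiveit_kb"))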

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
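
# Illustrative example: num_tokens_from_string() is not used below, but it can be
# used to spot-check chunk sizes against the cl100k_base encoding, e.g.:
#   print(num_tokens_from_string("some chunk of text", "cl100k_base"))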

conf = confluence_load()

# The LangChain document loader adds an 'id' metadata field, which conflicts with
# Weaviate's reserved id and causes errors, so rename it to 'pageid'.
new_documents = []
for doc in conf:
    if doc.page_content:  # skip pages that have no content
        metadata = doc.metadata.copy()  # copy the metadata dict to avoid changing the original
        metadata['pageid'] = metadata.pop('id')  # rename 'id' to 'pageid'
        new_doc = Document(page_content=doc.page_content, metadata=metadata)
        new_documents.append(new_doc)

# Split the Confluence documents into small, manageable snippets:
# first on character boundaries, then by tokens.
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=51)
texts = text_splitter.split_documents(new_documents)
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")  # cl100k_base is the encoding used by text-embedding-ada-002
texts = text_splitter.split_documents(texts)

# Send the chunks to Weaviate in batches.
counter = 0
with client.batch(batch_size=500) as batch:
    for document in texts:
        # Print a progress message every 100 objects.
        if counter % 100 == 0:
            print(f"Import {counter} / {len(texts)}", end="\r")
        properties = {
            "text": document.page_content,
            "source": document.metadata['source'],
            "pageid": document.metadata['pageid'],
            "title": document.metadata['title']
        }
        batch.add_data_object(properties, "interactiveit_kb", None)
        counter = counter + 1
print(f"Import {counter} / {len(texts)}")
print("Import complete")