
Commit 1dde43c

add new examples
1 parent ed1dc0b commit 1dde43c

8 files changed: +402 −6 lines changed
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
"""
Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperMultiGraph
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

text = pd.read_csv(file_path)

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_model_instance = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
)

embedder_model_instance = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}

# ************************************************
# Create the CSVScraperMultiGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperMultiGraph(
    prompt="List me all the last names",
    source=[str(text), str(text)],
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
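
Note on the CSV source above: str() of a pandas DataFrame returns its printed representation, which pandas may truncate for large frames. A minimal alternative sketch, assuming the same inputs/username.csv layout, passes the full CSV text instead:

# Hedged sketch: supply the complete CSV content rather than str(DataFrame).
# Paths mirror the example above; this is an illustration, not part of the commit.
import os
import pandas as pd

curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, "inputs/username.csv")

# Option 1: read the raw file content
with open(file_path, "r", encoding="utf-8") as f:
    csv_text = f.read()

# Option 2: round-trip through pandas without the repr truncation
csv_text_from_df = pd.read_csv(file_path).to_csv(index=False)

sources = [csv_text, csv_text_from_df]  # both are plain CSV strings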
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
"""
Module showing how JSONScraperMultiGraph works with multiple JSON sources
"""

import os
import json
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from scrapegraphai.graphs import JSONScraperMultiGraph

llm_model_instance = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
)

embedder_model_instance = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}

FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

sources = [text, text]

multiple_search_graph = JSONScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=sources,
    schema=None,
    config=graph_config
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
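
Since the multi graph takes a list of document strings, the same pattern extends to distinct files. A brief sketch, where inputs/example_2.json is a hypothetical second document and not part of this commit:

# Hedged sketch: build the sources list from two different JSON files.
# "inputs/example_2.json" is hypothetical and only shown for illustration.
import os

curr_dir = os.path.dirname(os.path.realpath(__file__))
sources = []
for name in ("inputs/example.json", "inputs/example_2.json"):
    with open(os.path.join(curr_dir, name), "r", encoding="utf-8") as f:
        sources.append(f.read())
# sources can then be passed to JSONScraperMultiGraph(source=sources, ...) as above.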
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
"""
Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperMultiGraph
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_model_instance = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
)

embedder_model_instance = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}

# ************************************************
# Create the XMLScraperMultiGraph instance and run it
# ************************************************

xml_scraper_graph = XMLScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=[text, text],  # Pass the content of the file, not the file object
    config=graph_config
)

result = xml_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
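
The three Azure examples above read their API version and deployment names from the environment via load_dotenv(). A small pre-flight sketch that checks only the variables visible in this diff; the endpoint and key variables that the langchain_openai clients also read are an assumption and are only mentioned in the comments:

# Hedged sketch: verify the environment variables used by the Azure examples above.
# AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY are assumptions: they are consumed
# internally by the langchain_openai clients and do not appear in this diff.
import os
from dotenv import load_dotenv

load_dotenv()
required = [
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME",
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME",
]
missing = [var for var in required if not os.getenv(var)]
if missing:
    raise EnvironmentError(f"Missing environment variables: {missing}")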
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
"""
Module showing how JSONScraperMultiGraph works with multiple JSON sources
"""

import os
import json
from scrapegraphai.graphs import JSONScraperMultiGraph

graph_config = {
    "llm": {
        "client": "client_name",
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3"
    }
}

FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

sources = [text, text]

multiple_search_graph = JSONScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=sources,
    schema=None,
    config=graph_config
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
"""
Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents
"""

import os
import pandas as pd
from scrapegraphai.graphs import CSVScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

text = pd.read_csv(file_path)

# ************************************************
# Define the configuration for the graph
# ************************************************

HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}

# ************************************************
# Create the CSVScraperMultiGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperMultiGraph(
    prompt="List me all the last names",
    source=[str(text), str(text)],
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
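
The HuggingFace examples rely only on HUGGINGFACEHUB_API_TOKEN, and os.getenv() silently returns None when it is missing. A minimal guard sketch, shown as an illustration rather than part of the commit:

# Hedged sketch: fail fast if the token used by the HuggingFace examples is unset.
import os

if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it before running the example."
    )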
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
"""
Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents
"""

import os
from scrapegraphai.graphs import XMLScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}

# ************************************************
# Create the XMLScraperMultiGraph instance and run it
# ************************************************

xml_scraper_graph = XMLScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=[text, text],  # Pass the content of the file, not the file object
    config=graph_config
)

result = xml_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
