
Commit 67ac884

Merge pull request #91 from VinciGit00/fixing_bugs
2 parents c548032 + a9b11e4 commit 67ac884

File tree

14 files changed (+168, -78 lines)

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
@@ -1,3 +1,22 @@
+## [0.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.4.0-beta.1...v0.4.0-beta.2) (2024-04-27)
+
+
+### Bug Fixes
+
+* robot node and proxies ([adbc08f](https://github.com/VinciGit00/Scrapegraph-ai/commit/adbc08f27bc0966822f054f3af0e1f94fc0b87f5))
+
+## [0.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.3.0...v0.4.0-beta.1) (2024-04-27)
+
+
+### Features
+
+* add new proxy rotation function ([f6077d1](https://github.com/VinciGit00/Scrapegraph-ai/commit/f6077d1f98023ac3bf0c89ef6b3d67dde4818df7))
+
+
+### Bug Fixes
+
+* changed proxy function ([b754dd9](https://github.com/VinciGit00/Scrapegraph-ai/commit/b754dd909cd2aa2d5b5d94d9c7879ba3da58adc4))
+
 ## [0.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.2.8...v0.3.0) (2024-04-26)
 
 
examples/local_models/Ollama/smart_scraper_ollama.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "model": "ollama/mistral",
-        "temperature": 1,
+        "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "model_tokens": 2000, # set context length arbitrarily,
         "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
examples/single_node/fetch_node.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+from scrapegraphai.nodes import FetchNode
+
+# ************************************************
+# Define the node
+# ************************************************
+
+
+robots_node = FetchNode(
+    input="url | local_dir",
+    output=["doc"],
+)
+
+# ************************************************
+# Test the node
+# ************************************************
+
+state = {
+    "url": "https://twitter.com/home"
+}
+
+result = robots_node.execute(state)
+
+print(result)
examples/single_node/robot_node.py

Lines changed: 3 additions & 9 deletions
@@ -2,22 +2,16 @@
 Example of custom graph using existing nodes
 """
 
-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from scrapegraphai.models import Ollama
 from scrapegraphai.nodes import RobotsNode
-load_dotenv()
 
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
 
-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "ollama/llama3",
         "temperature": 0,
         "streaming": True
     },
@@ -27,7 +21,7 @@
 # Define the node
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
+llm_model = Ollama(graph_config["llm"])
 
 robots_node = RobotsNode(
     input="url",
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapegraphai"
-version = "0.3.1"
+version = "0.4.0b2"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra <mvincig11@gmail.com>",

scrapegraphai/graphs/base_graph.py

Lines changed: 18 additions & 14 deletions
@@ -56,7 +56,7 @@ def _create_edges(self, edges: list) -> dict:
             edge_dict[from_node.node_name] = to_node.node_name
         return edge_dict
 
-    def execute(self, initial_state: dict) -> dict:
+    def execute(self, initial_state: dict) -> (dict, list):
         """
         Executes the graph by traversing nodes starting from the entry point. The execution
         follows the edges based on the result of each node's execution and continues until
@@ -68,13 +68,12 @@ def execute(self, initial_state: dict) -> dict:
         Returns:
             dict: The state after execution has completed, which may have been altered by the nodes.
         """
-        print(self.nodes)
         current_node_name = self.nodes[0]
         state = initial_state
 
        # variables for tracking execution info
         total_exec_time = 0.0
-        exec_info = {}
+        exec_info = []
         cb_total = {
             "total_tokens": 0,
             "prompt_tokens": 0,
@@ -94,18 +93,19 @@ def execute(self, initial_state: dict) -> dict:
             total_exec_time += node_exec_time
 
             cb = {
+                "node_name": index.node_name,
                 "total_tokens": cb.total_tokens,
                 "prompt_tokens": cb.prompt_tokens,
                 "completion_tokens": cb.completion_tokens,
                 "successful_requests": cb.successful_requests,
                 "total_cost_USD": cb.total_cost,
-            }
-
-            exec_info[current_node_name] = {
                 "exec_time": node_exec_time,
-                "model_info": cb
             }
 
+            exec_info.append(
+                cb
+            )
+
             cb_total["total_tokens"] += cb["total_tokens"]
             cb_total["prompt_tokens"] += cb["prompt_tokens"]
             cb_total["completion_tokens"] += cb["completion_tokens"]
@@ -119,10 +119,14 @@ def execute(self, initial_state: dict) -> dict:
             else:
                 current_node_name = None
 
-        execution_info = {
-            "total_exec_time": total_exec_time,
-            "total_model_info": cb_total,
-            "nodes_info": exec_info
-        }
-
-        return state, execution_info
+        exec_info.append({
+            "node_name": "TOTAL RESULT",
+            "total_tokens": cb_total["total_tokens"],
+            "prompt_tokens": cb_total["prompt_tokens"],
+            "completion_tokens": cb_total["completion_tokens"],
+            "successful_requests": cb_total["successful_requests"],
+            "total_cost_USD": cb_total["total_cost_USD"],
+            "exec_time": total_exec_time,
+        })
+
+        return state, exec_info
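With this change, execute() returns the state together with a flat list of per-node stats, capped by a "TOTAL RESULT" entry, instead of the earlier nested execution_info dict. A minimal sketch of the new return shape (graph construction elided; all numbers are illustrative):

# Illustrative only: assumes `graph` is an already-built BaseGraph
state, exec_info = graph.execute({"url": "https://example.com"})

# exec_info is now a flat list of dicts, one per executed node,
# plus a final aggregate entry, e.g.:
# [
#     {"node_name": "Fetch", "total_tokens": 0, "prompt_tokens": 0,
#      "completion_tokens": 0, "successful_requests": 0,
#      "total_cost_USD": 0.0, "exec_time": 1.2},
#     {"node_name": "TOTAL RESULT", "total_tokens": 0, "prompt_tokens": 0,
#      "completion_tokens": 0, "successful_requests": 0,
#      "total_cost_USD": 0.0, "exec_time": 1.2},
# ]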

scrapegraphai/nodes/fetch_node.py

Lines changed: 8 additions & 3 deletions
@@ -42,6 +42,7 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
             node_name (str): name of the node
+            prox_rotation (bool): if you want to rotate proxies
         """
         super().__init__(node_name, "node", input, output, 1)
 
@@ -58,7 +59,7 @@ def execute(self, state):
 
         Raises:
             KeyError: If the 'url' key is not found in the state, indicating that the
-            necessary information to perform the operation is missing.
+                necessary information to perform the operation is missing.
         """
         print(f"--- Executing {self.node_name} Node ---")
 
@@ -76,9 +77,13 @@ def execute(self, state):
                 "source": "local_dir"
             })]
 
-        # if it is a URL
         else:
-            loader = AsyncHtmlLoader(source)
+            if self.node_config is not None and self.node_config.get("endpoint") is not None:
+                loader = AsyncHtmlLoader(
+                    source, proxies={"http": self.node_config["endpoint"]})
+            else:
+                loader = AsyncHtmlLoader(source)
+
             document = loader.load()
             compressed_document = [
                 Document(page_content=remover(str(document)))]
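The new branch only activates when self.node_config carries an "endpoint" key; the diff does not show how node_config is normally populated, so the sketch below sets it directly on the node and uses a placeholder proxy URL:

from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
)

# Assumption for illustration: node_config is assigned by hand here,
# since this diff does not show where it is set in the pipeline.
fetch_node.node_config = {"endpoint": "http://127.0.0.1:8899"}  # placeholder proxy

# With "endpoint" set, the URL branch builds
# AsyncHtmlLoader(source, proxies={"http": node_config["endpoint"]})
result = fetch_node.execute({"url": "https://example.com"})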

scrapegraphai/nodes/robots_node.py

Lines changed: 8 additions & 7 deletions
@@ -1,5 +1,5 @@
 """
-Module for checking if a website is scrapepable or not 
+Module for checking if a website is scrapepable or not
 """
 from typing import List
 from urllib.parse import urlparse
@@ -12,7 +12,7 @@
 
 class RobotsNode(BaseNode):
     """
-    A node responsible for checking if a website is scrapepable or not. 
+    A node responsible for checking if a website is scrapepable or not.
     It uses the AsyncHtmlLoader for asynchronous
     document loading.
 
@@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra
             node_config (dict): Configuration parameters for the node.
             force_scraping (bool): A flag indicating whether scraping should be enforced even
                 if disallowed by robots.txt. Defaults to True.
-            node_name (str, optional): The unique identifier name for the node. 
+            node_name (str, optional): The unique identifier name for the node.
                 Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ def execute(self, state):
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
         document = loader.load()
-        model = self.llm_model.model_name
-
-        if "ollama" in model:
-            model = model.split("/", maxsplit=1)[-1]
+        if "ollama" in self.llm_model.model:
+            self.llm_model.model = self.llm_model.model.split("/")[-1]
+            model = self.llm_model.model.split("/")[-1]
 
+        else:
+            model = self.llm_model.model_name
         try:
            agent = robots_dictionary[model]
 
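The rewritten branch strips the provider prefix from Ollama model names before the robots_dictionary lookup (e.g. "ollama/llama3" becomes "llama3"); a standalone sketch of that normalization:

# Standalone sketch of the model-name normalization above
model = "ollama/llama3"
if "ollama" in model:
    model = model.split("/")[-1]
print(model)  # -> llama3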
scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
+from .proxy_rotation import proxy_generator

scrapegraphai/utils/prettify_exec_info.py

Lines changed: 4 additions & 31 deletions
@@ -5,44 +5,17 @@
 import pandas as pd
 
 
-def prettify_exec_info(complete_result: dict) -> pd.DataFrame:
+def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
     """
     Transform the execution information of the graph into a DataFrame for better visualization.
 
     Args:
-    - complete_result (dict): The complete execution information of the graph.
+    - complete_result (list[dict]): The complete execution information of the graph.
 
     Returns:
     - pd.DataFrame: The execution information of the graph in a DataFrame.
     """
 
-    nodes_info = complete_result['nodes_info']
-    total_info = {
-        'total_exec_time': complete_result['total_exec_time'],
-        'total_model_info': complete_result['total_model_info']
-    }
+    df_nodes = pd.DataFrame(complete_result)
 
-    # Convert node-specific information to DataFrame
-    flat_data = []
-    for node_name, node_info in nodes_info.items():
-        flat_data.append({
-            'Node': node_name,
-            'Execution Time': node_info['exec_time'],
-            # Unpack the model_info dict into the row
-            **node_info['model_info']
-        })
-
-    df_nodes = pd.DataFrame(flat_data)
-
-    # Add a row for the total execution time and total model info
-    total_row = {
-        'Node': 'Total',
-        'Execution Time': total_info['total_exec_time'],
-        # Unpack the total_model_info dict into the row
-        **total_info['total_model_info']
-    }
-    df_total = pd.DataFrame([total_row])
-
-    # Combine the nodes DataFrame with the total info DataFrame
-    df_combined_with_total = pd.concat([df_nodes, df_total], ignore_index=True)
-    return df_combined_with_total
+    return df_nodes
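Since the function now just wraps pd.DataFrame over the list produced by BaseGraph.execute, a minimal usage sketch (entries are illustrative):

from scrapegraphai.utils import prettify_exec_info

# Illustrative exec_info in the new list-of-dicts shape
exec_info = [
    {"node_name": "Fetch", "total_tokens": 0, "exec_time": 1.2},
    {"node_name": "TOTAL RESULT", "total_tokens": 0, "exec_time": 1.2},
]

df = prettify_exec_info(exec_info)  # one DataFrame row per dict
print(df)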

scrapegraphai/utils/proxy_rotation.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+"""
+Module for rotating proxies
+"""
+from fp.fp import FreeProxy
+
+
+def proxy_generator(num_ips: int):
+    """
+    Rotates through a specified number of proxy IPs using the FreeProxy library.
+
+    Args:
+        num_ips (int): The number of proxy IPs to rotate through.
+
+    Returns:
+        list: The rotated proxy IPs, in rotation order.
+
+    Example:
+        >>> proxy_generator(5)
+        [
+         '192.168.1.1:8080',
+         '103.10.63.135:8080',
+         '176.9.75.42:8080',
+         '37.57.216.2:8080',
+         '113.20.31.250:8080'
+        ]
+    """
+    res = []
+
+    for i in range(0, num_ips):
+        res.append(FreeProxy().get())
+    return res
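A short usage sketch (requires the free-proxy package; the addresses returned vary from run to run):

from scrapegraphai.utils import proxy_generator

proxies = proxy_generator(3)  # three FreeProxy().get() calls
print(proxies)                # e.g. ['http://103.10.63.135:8080', ...]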

tests/nodes/.env.example

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/nodes/fetch_node_test.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+"""
+Module for testing fetch_node
+"""
+import pytest
+from scrapegraphai.nodes import FetchNode
+
+
+@pytest.fixture
+def setup():
+    """
+    setup
+    """
+    # ************************************************
+    # Define the node
+    # ************************************************
+
+    robots_node = FetchNode(
+        input="url | local_dir",
+        output=["doc"],
+    )
+
+    return robots_node
+
+# ************************************************
+# Test the node
+# ************************************************
+
+
+def test_robots_node(setup):
+    """
+    Run the tests
+    """
+    state = {
+        "url": "https://twitter.com/home"
+    }
+
+    result = setup.execute(state)
+
+    assert result is not None
+
+
+# If you need to run this script directly
+if __name__ == "__main__":
+    pytest.main()
