Skip to content

Commit 2419003

Browse files
committed
fix: fix robot node
1 parent a87702f commit 2419003

File tree

3 files changed

+76
-58
lines changed

3 files changed

+76
-58
lines changed

examples/single_node/robot_node.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,15 @@
1111

1212
graph_config = {
1313
"llm": {
14-
"model_name": "ollama/llama3",
14+
"model": "ollama/llama3",
1515
"temperature": 0,
1616
"streaming": True
1717
},
18+
"embeddings": {
19+
"model": "ollama/nomic-embed-text",
20+
"temperature": 0,
21+
# "base_url": "http://localhost:11434", # optional: point to a custom Ollama server URL
22+
}
1823
}
1924

2025
# ************************************************

scrapegraphai/nodes/robots_node.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,11 +111,11 @@ def execute(self, state: dict) -> dict:
111111
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
112112
loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
113113
document = loader.load()
114-
if "ollama" in self.llm_model.model_name:
115-
self.llm_model.model_name = self.llm_model.model_name.split("/")[-1]
116-
model = self.llm_model.model_name.split("/")[-1]
114+
if "ollama" in self.llm_model.model:
115+
self.llm_model.model = self.llm_model.model.split("/")[-1]
116+
model = self.llm_model.model.split("/")[-1]
117117
else:
118-
model = self.llm_model.model_name
118+
model = self.llm_model.model
119119
try:
120120
agent = robots_dictionary[model]
121121

@@ -146,4 +146,4 @@ def execute(self, state: dict) -> dict:
146146
self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")
147147

148148
state.update({self.output[0]: is_scrapable})
149-
return state
149+
return state

tests/nodes/robot_node_test.py

Lines changed: 65 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,74 @@
1-
"""
2-
Module for the tests
3-
"""
4-
import os
51
import pytest
6-
from scrapegraphai.graphs import SmartScraperGraph
2+
from unittest.mock import MagicMock
3+
4+
from scrapegraphai.models import Ollama
5+
from scrapegraphai.nodes import RobotsNode
76

87
@pytest.fixture
9-
def sample_text():
10-
"""
11-
Example of text fixture.
12-
"""
13-
file_name = "inputs/plain_html_example.txt"
14-
curr_dir = os.path.dirname(os.path.realpath(__file__))
15-
file_path = os.path.join(curr_dir, file_name)
8+
def mock_llm_model():
9+
mock_model = MagicMock()
10+
mock_model.model = "ollama/llama3"
11+
mock_model.__call__ = MagicMock(return_value=["yes"])
12+
return mock_model
1613

17-
with open(file_path, 'r', encoding="utf-8") as file:
18-
text = file.read()
14+
@pytest.fixture
15+
def robots_node(mock_llm_model):
16+
return RobotsNode(
17+
input="url",
18+
output=["is_scrapable"],
19+
node_config={"llm_model": mock_llm_model, "headless": False}
20+
)
1921

20-
return text
22+
def test_robots_node_scrapable(robots_node):
23+
state = {
24+
"url": "https://perinim.github.io/robots.txt"
25+
}
2126

22-
@pytest.fixture
23-
def graph_config():
24-
"""
25-
Configuration of the graph fixture.
26-
"""
27-
return {
28-
"llm": {
29-
"model": "ollama/mistral",
30-
"temperature": 0,
31-
"format": "json",
32-
"base_url": "http://localhost:11434",
33-
},
34-
"embeddings": {
35-
"model": "ollama/nomic-embed-text",
36-
"temperature": 0,
37-
"base_url": "http://localhost:11434",
38-
}
27+
# Mock AsyncChromiumLoader to return fake robots.txt content
28+
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nAllow: /")))
29+
30+
# Execute the node
31+
result_state, result = robots_node.execute(state)
32+
33+
# Check the updated state
34+
assert result_state["is_scrapable"] == "yes"
35+
assert result == ("is_scrapable", "yes")
36+
37+
def test_robots_node_not_scrapable(robots_node):
38+
state = {
39+
"url": "https://twitter.com/home"
3940
}
4041

41-
def test_scraping_pipeline(sample_text, graph_config):
42-
"""
43-
Test the SmartScraperGraph scraping pipeline.
44-
"""
45-
smart_scraper_graph = SmartScraperGraph(
46-
prompt="List me all the news with their description.",
47-
source=sample_text,
48-
config=graph_config
49-
)
42+
# Mock AsyncChromiumLoader to return fake robots.txt content
43+
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /")))
44+
45+
# Mock the LLM response to return "no"
46+
robots_node.llm_model.__call__.return_value = ["no"]
47+
48+
# Execute the node and expect a ValueError because force_scraping is False by default
49+
with pytest.raises(ValueError):
50+
robots_node.execute(state)
51+
52+
def test_robots_node_force_scrapable(robots_node):
53+
state = {
54+
"url": "https://twitter.com/home"
55+
}
56+
57+
# Mock AsyncChromiumLoader to return fake robots.txt content
58+
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /")))
59+
60+
# Mock the LLM response to return "no"
61+
robots_node.llm_model.__call__.return_value = ["no"]
62+
63+
# Set force_scraping to True
64+
robots_node.force_scraping = True
65+
66+
# Execute the node
67+
result_state, result = robots_node.execute(state)
68+
69+
# Check the updated state
70+
assert result_state["is_scrapable"] == "no"
71+
assert result == ("is_scrapable", "no")
5072

51-
result = smart_scraper_graph.run()
52-
53-
assert result is not None
54-
# Additional assertions to check the structure of the result
55-
assert isinstance(result, dict) # Assuming the result is a dictionary
56-
assert "news" in result # Assuming the result should contain a key "news"
57-
assert "is_scrapable" in result
58-
assert isinstance(result["is_scrapable"], bool)
59-
assert result["is_scrapable"] is True
60-
# Ensure the execute method was called once
61-
mock_execute.assert_called_once_with(initial_state)
73+
if __name__ == "__main__":
74+
pytest.main()

0 commit comments

Comments
 (0)