|
1 |
| -""" |
2 |
| -Module for the tests |
3 |
| -""" |
4 |
| -import os |
5 | 1 | import pytest
|
6 |
| -from scrapegraphai.graphs import SmartScraperGraph |
| 2 | +from unittest.mock import MagicMock |
| 3 | + |
| 4 | +from scrapegraphai.models import Ollama |
| 5 | +from scrapegraphai.nodes import RobotsNode |
7 | 6 |
|
8 | 7 | @pytest.fixture
|
9 |
| -def sample_text(): |
10 |
| - """ |
11 |
| - Example of text fixture. |
12 |
| - """ |
13 |
| - file_name = "inputs/plain_html_example.txt" |
14 |
| - curr_dir = os.path.dirname(os.path.realpath(__file__)) |
15 |
| - file_path = os.path.join(curr_dir, file_name) |
| 8 | +def mock_llm_model(): |
| 9 | + mock_model = MagicMock() |
| 10 | + mock_model.model = "ollama/llama3" |
| 11 | + mock_model.__call__ = MagicMock(return_value=["yes"]) |
| 12 | + return mock_model |
16 | 13 |
|
17 |
| - with open(file_path, 'r', encoding="utf-8") as file: |
18 |
| - text = file.read() |
| 14 | +@pytest.fixture |
| 15 | +def robots_node(mock_llm_model): |
| 16 | + return RobotsNode( |
| 17 | + input="url", |
| 18 | + output=["is_scrapable"], |
| 19 | + node_config={"llm_model": mock_llm_model, "headless": False} |
| 20 | + ) |
19 | 21 |
|
20 |
| - return text |
| 22 | +def test_robots_node_scrapable(robots_node): |
| 23 | + state = { |
| 24 | + "url": "https://perinim.github.io/robots.txt" |
| 25 | + } |
21 | 26 |
|
22 |
| -@pytest.fixture |
23 |
| -def graph_config(): |
24 |
| - """ |
25 |
| - Configuration of the graph fixture. |
26 |
| - """ |
27 |
| - return { |
28 |
| - "llm": { |
29 |
| - "model": "ollama/mistral", |
30 |
| - "temperature": 0, |
31 |
| - "format": "json", |
32 |
| - "base_url": "http://localhost:11434", |
33 |
| - }, |
34 |
| - "embeddings": { |
35 |
| - "model": "ollama/nomic-embed-text", |
36 |
| - "temperature": 0, |
37 |
| - "base_url": "http://localhost:11434", |
38 |
| - } |
| 27 | + # Mocking AsyncChromiumLoader to return a fake robots.txt content |
| 28 | + robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nAllow: /"))) |
| 29 | + |
| 30 | + # Execute the node |
| 31 | + result_state, result = robots_node.execute(state) |
| 32 | + |
| 33 | + # Check the updated state |
| 34 | + assert result_state["is_scrapable"] == "yes" |
| 35 | + assert result == ("is_scrapable", "yes") |
| 36 | + |
| 37 | +def test_robots_node_not_scrapable(robots_node): |
| 38 | + state = { |
| 39 | + "url": "https://twitter.com/home" |
39 | 40 | }
|
40 | 41 |
|
41 |
| -def test_scraping_pipeline(sample_text, graph_config): |
42 |
| - """ |
43 |
| - Test the SmartScraperGraph scraping pipeline. |
44 |
| - """ |
45 |
| - smart_scraper_graph = SmartScraperGraph( |
46 |
| - prompt="List me all the news with their description.", |
47 |
| - source=sample_text, |
48 |
| - config=graph_config |
49 |
| - ) |
| 42 | + # Mocking AsyncChromiumLoader to return a fake robots.txt content |
| 43 | + robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /"))) |
| 44 | + |
| 45 | + # Mock the LLM response to return "no" |
| 46 | + robots_node.llm_model.__call__.return_value = ["no"] |
| 47 | + |
| 48 | + # Execute the node and expect a ValueError because force_scraping is False by default |
| 49 | + with pytest.raises(ValueError): |
| 50 | + robots_node.execute(state) |
| 51 | + |
| 52 | +def test_robots_node_force_scrapable(robots_node): |
| 53 | + state = { |
| 54 | + "url": "https://twitter.com/home" |
| 55 | + } |
| 56 | + |
| 57 | + # Mocking AsyncChromiumLoader to return a fake robots.txt content |
| 58 | + robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /"))) |
| 59 | + |
| 60 | + # Mock the LLM response to return "no" |
| 61 | + robots_node.llm_model.__call__.return_value = ["no"] |
| 62 | + |
| 63 | + # Set force_scraping to True |
| 64 | + robots_node.force_scraping = True |
| 65 | + |
| 66 | + # Execute the node |
| 67 | + result_state, result = robots_node.execute(state) |
| 68 | + |
| 69 | + # Check the updated state |
| 70 | + assert result_state["is_scrapable"] == "no" |
| 71 | + assert result == ("is_scrapable", "no") |
50 | 72 |
|
51 |
| - result = smart_scraper_graph.run() |
52 |
| - |
53 |
| - assert result is not None |
54 |
| - # Additional assertions to check the structure of the result |
55 |
| - assert isinstance(result, dict) # Assuming the result is a dictionary |
56 |
| - assert "news" in result # Assuming the result should contain a key "news" |
57 |
| - assert "is_scrapable" in result |
58 |
| - assert isinstance(result["is_scrapable"], bool) |
59 |
| - assert result["is_scrapable"] is True |
60 |
| - # Ensure the execute method was called once |
61 |
| - mock_execute.assert_called_once_with(initial_state) |
| 73 | +if __name__ == "__main__": |
| 74 | + pytest.main() |
0 commit comments