
Commit 67ac884

Merge pull request #91 from VinciGit00/fixing_bugs
2 parents c548032 + a9b11e4 commit 67ac884

File tree

14 files changed (+168, -78 lines)

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
@@ -1,3 +1,22 @@
+## [0.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.4.0-beta.1...v0.4.0-beta.2) (2024-04-27)
+
+
+### Bug Fixes
+
+* robot node and proxies ([adbc08f](https://github.com/VinciGit00/Scrapegraph-ai/commit/adbc08f27bc0966822f054f3af0e1f94fc0b87f5))
+
+## [0.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.3.0...v0.4.0-beta.1) (2024-04-27)
+
+
+### Features
+
+* add new proxy rotation function ([f6077d1](https://github.com/VinciGit00/Scrapegraph-ai/commit/f6077d1f98023ac3bf0c89ef6b3d67dde4818df7))
+
+
+### Bug Fixes
+
+* changed proxy function ([b754dd9](https://github.com/VinciGit00/Scrapegraph-ai/commit/b754dd909cd2aa2d5b5d94d9c7879ba3da58adc4))
+
 ## [0.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.2.8...v0.3.0) (2024-04-26)
 
 
examples/local_models/Ollama/smart_scraper_ollama.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "model": "ollama/mistral",
-        "temperature": 1,
+        "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "model_tokens": 2000, # set context length arbitrarily,
         "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
examples/single_node/fetch_node.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+from scrapegraphai.nodes import FetchNode
+
+# ************************************************
+# Define the node
+# ************************************************
+
+
+robots_node = FetchNode(
+    input="url | local_dir",
+    output=["doc"],
+)
+
+# ************************************************
+# Test the node
+# ************************************************
+
+state = {
+    "url": "https://twitter.com/home"
+}
+
+result = robots_node.execute(state)
+
+print(result)
examples/single_node/robot_node.py

Lines changed: 3 additions & 9 deletions
@@ -2,22 +2,16 @@
 Example of custom graph using existing nodes
 """
 
-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from scrapegraphai.models import Ollama
 from scrapegraphai.nodes import RobotsNode
-load_dotenv()
 
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
 
-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "ollama/llama3",
         "temperature": 0,
         "streaming": True
     },
@@ -27,7 +21,7 @@
 # Define the node
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
+llm_model = Ollama(graph_config["llm"])
 
 robots_node = RobotsNode(
     input="url",
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapegraphai"
-version = "0.3.1"
+version = "0.4.0b2"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra <mvincig11@gmail.com>",

scrapegraphai/graphs/base_graph.py

Lines changed: 18 additions & 14 deletions
@@ -56,7 +56,7 @@ def _create_edges(self, edges: list) -> dict:
             edge_dict[from_node.node_name] = to_node.node_name
         return edge_dict
 
-    def execute(self, initial_state: dict) -> dict:
+    def execute(self, initial_state: dict) -> (dict, list):
         """
         Executes the graph by traversing nodes starting from the entry point. The execution
         follows the edges based on the result of each node's execution and continues until
@@ -68,13 +68,12 @@ def execute(self, initial_state: dict) -> dict:
         Returns:
             dict: The state after execution has completed, which may have been altered by the nodes.
         """
-        print(self.nodes)
         current_node_name = self.nodes[0]
         state = initial_state
 
        # variables for tracking execution info
         total_exec_time = 0.0
-        exec_info = {}
+        exec_info = []
         cb_total = {
             "total_tokens": 0,
             "prompt_tokens": 0,
@@ -94,18 +93,19 @@ def execute(self, initial_state: dict) -> dict:
             total_exec_time += node_exec_time
 
             cb = {
+                "node_name": index.node_name,
                 "total_tokens": cb.total_tokens,
                 "prompt_tokens": cb.prompt_tokens,
                 "completion_tokens": cb.completion_tokens,
                 "successful_requests": cb.successful_requests,
                 "total_cost_USD": cb.total_cost,
-            }
-
-            exec_info[current_node_name] = {
                 "exec_time": node_exec_time,
-                "model_info": cb
             }
 
+            exec_info.append(
+                cb
+            )
+
             cb_total["total_tokens"] += cb["total_tokens"]
             cb_total["prompt_tokens"] += cb["prompt_tokens"]
             cb_total["completion_tokens"] += cb["completion_tokens"]
@@ -119,10 +119,14 @@ def execute(self, initial_state: dict) -> dict:
             else:
                 current_node_name = None
 
-        execution_info = {
-            "total_exec_time": total_exec_time,
-            "total_model_info": cb_total,
-            "nodes_info": exec_info
-        }
-
-        return state, execution_info
+        exec_info.append({
+            "node_name": "TOTAL RESULT",
+            "total_tokens": cb_total["total_tokens"],
+            "prompt_tokens": cb_total["prompt_tokens"],
+            "completion_tokens": cb_total["completion_tokens"],
+            "successful_requests": cb_total["successful_requests"],
+            "total_cost_USD": cb_total["total_cost_USD"],
+            "exec_time": total_exec_time,
+        })
+
+        return state, exec_info
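With this change, execute() returns the state together with a flat list of per-node stats, capped by a "TOTAL RESULT" entry, instead of the earlier nested execution_info dict. A minimal sketch of the new return shape (graph construction elided; all numbers are illustrative):

# Illustrative only: assumes `graph` is an already-built BaseGraph
state, exec_info = graph.execute({"url": "https://example.com"})

# exec_info is now a flat list of dicts, one per executed node,
# plus a final aggregate entry, e.g.:
# [
#     {"node_name": "Fetch", "total_tokens": 0, "prompt_tokens": 0,
#      "completion_tokens": 0, "successful_requests": 0,
#      "total_cost_USD": 0.0, "exec_time": 1.2},
#     {"node_name": "TOTAL RESULT", "total_tokens": 0, "prompt_tokens": 0,
#      "completion_tokens": 0, "successful_requests": 0,
#      "total_cost_USD": 0.0, "exec_time": 1.2},
# ]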

scrapegraphai/nodes/fetch_node.py

Lines changed: 8 additions & 3 deletions
@@ -42,6 +42,7 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
             node_name (str): name of the node
+            prox_rotation (bool): if you want to rotate proxies
         """
         super().__init__(node_name, "node", input, output, 1)
 
@@ -58,7 +59,7 @@ def execute(self, state):
 
         Raises:
             KeyError: If the 'url' key is not found in the state, indicating that the
-            necessary information to perform the operation is missing.
+                necessary information to perform the operation is missing.
         """
         print(f"--- Executing {self.node_name} Node ---")
 
@@ -76,9 +77,13 @@ def execute(self, state):
                 "source": "local_dir"
             })]
 
-        # if it is a URL
         else:
-            loader = AsyncHtmlLoader(source)
+            if self.node_config is not None and self.node_config.get("endpoint") is not None:
+                loader = AsyncHtmlLoader(
+                    source, proxies={"http": self.node_config["endpoint"]})
+            else:
+                loader = AsyncHtmlLoader(source)
+
             document = loader.load()
             compressed_document = [
                 Document(page_content=remover(str(document)))]
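The new branch only activates when self.node_config carries an "endpoint" key; the diff does not show how node_config is normally populated, so the sketch below sets it directly on the node and uses a placeholder proxy URL:

from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
)

# Assumption for illustration: node_config is assigned by hand here,
# since this diff does not show where it is set in the pipeline.
fetch_node.node_config = {"endpoint": "http://127.0.0.1:8899"}  # placeholder proxy

# With "endpoint" set, the URL branch builds
# AsyncHtmlLoader(source, proxies={"http": node_config["endpoint"]})
result = fetch_node.execute({"url": "https://example.com"})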

scrapegraphai/nodes/robots_node.py

Lines changed: 8 additions & 7 deletions
@@ -1,5 +1,5 @@
 """
-Module for checking if a website is scrapepable or not 
+Module for checking if a website is scrapepable or not
 """
 from typing import List
 from urllib.parse import urlparse
@@ -12,7 +12,7 @@
 
 class RobotsNode(BaseNode):
     """
-    A node responsible for checking if a website is scrapepable or not. 
+    A node responsible for checking if a website is scrapepable or not.
     It uses the AsyncHtmlLoader for asynchronous
     document loading.
 
@@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra
             node_config (dict): Configuration parameters for the node.
             force_scraping (bool): A flag indicating whether scraping should be enforced even
                 if disallowed by robots.txt. Defaults to True.
-            node_name (str, optional): The unique identifier name for the node. 
+            node_name (str, optional): The unique identifier name for the node.
                 Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ def execute(self, state):
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
         document = loader.load()
-        model = self.llm_model.model_name
-
-        if "ollama" in model:
-            model = model.split("/", maxsplit=1)[-1]
+        if "ollama" in self.llm_model.model:
+            self.llm_model.model = self.llm_model.model.split("/")[-1]
+            model = self.llm_model.model.split("/")[-1]
 
+        else:
+            model = self.llm_model.model_name
         try:
            agent = robots_dictionary[model]
 
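The rewritten branch strips the provider prefix from Ollama model names before the robots_dictionary lookup (e.g. "ollama/llama3" becomes "llama3"); a standalone sketch of that normalization:

# Standalone sketch of the model-name normalization above
model = "ollama/llama3"
if "ollama" in model:
    model = model.split("/")[-1]
print(model)  # -> llama3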
scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
+from .proxy_rotation import proxy_generator

scrapegraphai/utils/prettify_exec_info.py

Lines changed: 4 additions & 31 deletions
@@ -5,44 +5,17 @@
 import pandas as pd
 
 
-def prettify_exec_info(complete_result: dict) -> pd.DataFrame:
+def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
     """
     Transform the execution information of the graph into a DataFrame for better visualization.
 
     Args:
-    - complete_result (dict): The complete execution information of the graph.
+    - complete_result (list[dict]): The complete execution information of the graph.
 
     Returns:
     - pd.DataFrame: The execution information of the graph in a DataFrame.
     """
 
-    nodes_info = complete_result['nodes_info']
-    total_info = {
-        'total_exec_time': complete_result['total_exec_time'],
-        'total_model_info': complete_result['total_model_info']
-    }
+    df_nodes = pd.DataFrame(complete_result)
 
-    # Convert node-specific information to DataFrame
-    flat_data = []
-    for node_name, node_info in nodes_info.items():
-        flat_data.append({
-            'Node': node_name,
-            'Execution Time': node_info['exec_time'],
-            # Unpack the model_info dict into the row
-            **node_info['model_info']
-        })
-
-    df_nodes = pd.DataFrame(flat_data)
-
-    # Add a row for the total execution time and total model info
-    total_row = {
-        'Node': 'Total',
-        'Execution Time': total_info['total_exec_time'],
-        # Unpack the total_model_info dict into the row
-        **total_info['total_model_info']
-    }
-    df_total = pd.DataFrame([total_row])
-
-    # Combine the nodes DataFrame with the total info DataFrame
-    df_combined_with_total = pd.concat([df_nodes, df_total], ignore_index=True)
-    return df_combined_with_total
+    return df_nodes
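Since the function now just wraps pd.DataFrame over the list produced by BaseGraph.execute, a minimal usage sketch (entries are illustrative):

from scrapegraphai.utils import prettify_exec_info

# Illustrative exec_info in the new list-of-dicts shape
exec_info = [
    {"node_name": "Fetch", "total_tokens": 0, "exec_time": 1.2},
    {"node_name": "TOTAL RESULT", "total_tokens": 0, "exec_time": 1.2},
]

df = prettify_exec_info(exec_info)  # one DataFrame row per dict
print(df)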

scrapegraphai/utils/proxy_rotation.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+"""
+Module for rotating proxies
+"""
+from fp.fp import FreeProxy
+
+
+def proxy_generator(num_ips: int):
+    """
+    Rotates through a specified number of proxy IPs using the FreeProxy library.
+
+    Args:
+        num_ips (int): The number of proxy IPs to rotate through.
+
+    Returns:
+        list: The rotated proxy IPs, in rotation order.
+
+    Example:
+        >>> proxy_generator(5)
+        [
+         '192.168.1.1:8080',
+         '103.10.63.135:8080',
+         '176.9.75.42:8080',
+         '37.57.216.2:8080',
+         '113.20.31.250:8080'
+        ]
+    """
+    res = []
+
+    for i in range(0, num_ips):
+        res.append(FreeProxy().get())
+    return res
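A short usage sketch (requires the free-proxy package; the addresses returned vary from run to run):

from scrapegraphai.utils import proxy_generator

proxies = proxy_generator(3)  # three FreeProxy().get() calls
print(proxies)                # e.g. ['http://103.10.63.135:8080', ...]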

tests/nodes/.env.example

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/nodes/fetch_node_test.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+"""
+Module for testing fetch_node
+"""
+import pytest
+from scrapegraphai.nodes import FetchNode
+
+
+@pytest.fixture
+def setup():
+    """
+    setup
+    """
+    # ************************************************
+    # Define the node
+    # ************************************************
+
+    robots_node = FetchNode(
+        input="url | local_dir",
+        output=["doc"],
+    )
+
+    return robots_node
+
+# ************************************************
+# Test the node
+# ************************************************
+
+
+def test_robots_node(setup):
+    """
+    Run the tests
+    """
+    state = {
+        "url": "https://twitter.com/home"
+    }
+
+    result = setup.execute(state)
+
+    assert result is not None
+
+
+# If you need to run this script directly
+if __name__ == "__main__":
+    pytest.main()
