
Commit ca3d9cc

Merge pull request #89 from VinciGit00/proxy_rotation
fix: robot node and proxy rotation
2 parents: 4bc7274 + adbc08f

File tree

6 files changed, +17 -25 lines changed

examples/local_models/Ollama/smart_scraper_ollama.py

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "model": "ollama/mistral",
-        "temperature": 1,
+        "temperature": 0,
         "format": "json", # Ollama needs the format to be specified explicitly
         # "model_tokens": 2000, # set context length arbitrarily,
         "base_url": "http://localhost:11434", # set ollama URL arbitrarily

examples/single_node/fetch_node.py

Whitespace-only changes.

examples/single_node/robot_node.py

Lines changed: 3 additions & 9 deletions

@@ -2,22 +2,16 @@
 Example of custom graph using existing nodes
 """
 
-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from scrapegraphai.models import Ollama
 from scrapegraphai.nodes import RobotsNode
-load_dotenv()
 
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
 
-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "ollama/llama3",
         "temperature": 0,
         "streaming": True
     },
@@ -27,7 +21,7 @@
 # Define the node
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
+llm_model = Ollama(graph_config["llm"])
 
 robots_node = RobotsNode(
     input="url",
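
The hunk ends mid-call. A hedged completion of the example, based on the __init__ signature visible in the scrapegraphai/nodes/robots_node.py hunk headers below; the output key and node_config contents are assumptions, not part of this commit:

# Assumed completion; only input="url" appears in the diff.
robots_node = RobotsNode(
    input="url",
    output=["is_scrapable"],
    node_config={"llm": llm_model},
)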

scrapegraphai/nodes/fetch_node.py

Lines changed: 4 additions & 7 deletions

@@ -7,7 +7,6 @@
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.remover import remover
-from ..utils.proxy_generator import proxy_generator
 
 
 class FetchNode(BaseNode):
@@ -38,16 +37,14 @@ class FetchNode(BaseNode):
     to succeed.
     """
 
-    def __init__(self, input: str, output: List[str], num_prox: int = True,
-                 node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         """
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
             node_name (str): name of the node
             prox_rotation (bool): if you wamt to rotate proxies
         """
         super().__init__(node_name, "node", input, output, 1)
-        self.num_prox = num_prox
 
     def execute(self, state):
         """
@@ -80,13 +77,13 @@ def execute(self, state):
                 "source": "local_dir"
             })]
 
-        # if it is a URL
         else:
-            if self.num_prox > 1:
+            if self.node_config.get("endpoint") is not None:
                 loader = AsyncHtmlLoader(
-                    source, proxies=proxy_generator(self.num_prox))
+                    source, proxies={"http": self.node_config["endpoint"]})
             else:
                 loader = AsyncHtmlLoader(source)
+
             document = loader.load()
             compressed_document = [
                 Document(page_content=remover(str(document)))]
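
For reference, a minimal runnable sketch of the new proxy path, assuming the langchain_community import location for AsyncHtmlLoader; the target URL and proxy endpoint are placeholders:

from langchain_community.document_loaders import AsyncHtmlLoader

# One fixed proxy endpoint for HTTP traffic, mirroring the
# {"http": endpoint} dict that FetchNode.execute() now builds.
loader = AsyncHtmlLoader(
    "https://example.com",
    proxies={"http": "http://localhost:8114"},
)
documents = loader.load()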

scrapegraphai/nodes/robots_node.py

Lines changed: 8 additions & 7 deletions

The first three hunks are whitespace-only edits to docstring lines; the visible text is unchanged:

- @@ -1,5 +1,5 @@ ("Module for checking if a website is scrapepable or not")
- @@ -12,7 +12,7 @@ ("A node responsible for checking if a website is scrapepable or not.")
- @@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra ("node_name (str, optional): The unique identifier name for the node. Defaults to "Robots".")

The remaining hunk changes how the model name is resolved:

@@ -112,11 +112,12 @@ def execute(self, state):
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
         document = loader.load()
-        model = self.llm_model.model_name
-
-        if "ollama" in model:
-            model = model.split("/", maxsplit=1)[-1]
+        if "ollama" in self.llm_model.model:
+            self.llm_model.model = self.llm_model.model.split("/")[-1]
+            model = self.llm_model.model.split("/")[-1]
 
+        else:
+            model = self.llm_model.model_name
         try:
             agent = robots_dictionary[model]
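
A short illustration of the normalization the last hunk performs, with example values standing in for the runtime attributes (note that the second split in the new code is a no-op once self.llm_model.model has already been stripped):

# Ollama model ids arrive as "ollama/<name>"; robots_dictionary is keyed
# by bare model names, so the prefix is stripped before the lookup.
model_id = "ollama/llama3"        # example value of self.llm_model.model
model = model_id.split("/")[-1]   # -> "llama3"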

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -5,4 +5,4 @@
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
-from .proxy_generator import proxy_generator
+from .proxy_rotation import proxy_generator
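
Since the module is renamed but the exported symbol is not, package-level imports like the one below should keep working (a sketch, not part of this diff):

# Downstream import unaffected by the proxy_generator -> proxy_rotation module rename.
from scrapegraphai.utils import proxy_generator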
