Commit ba783f8

Merge pull request #38 from ScrapeGraphAI/add-infinite-scrolling

feat: add infinite scrolling

2 parents 0a7a968 + 6272be9
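
At a glance, the change threads one new optional parameter, number_of_scrolls, from both the sync and async clients down into SmartScraperRequest. A minimal sketch of the new call surface, mirroring the examples added in this commit (the API key is a placeholder):

from scrapegraph_py import Client

client = Client(api_key="sgai-api-key")  # placeholder key
response = client.smartscraper(
    website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
    user_prompt="Extract all company names from the page",
    number_of_scrolls=10,  # scroll the page 10 times before extraction
)
print(response["result"])
client.close()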

File tree

8 files changed: +126 -2 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@
 .DS_Store
 **/.DS_Store
 *.csv
+venv/

__pycache__/test_sdk.cpython-312.pyc

838 Bytes (binary file not shown)

Lines changed: 63 additions & 0 deletions

import asyncio

from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

sgai_logger.set_logging(level="INFO")


async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
    """Scrape companies from a specific YC batch with infinite scroll."""
    try:
        # Initial scrape with infinite scroll enabled
        response = await client.smartscraper(
            website_url=url,
            user_prompt="Extract all company information from this page, including name, description, and website",
            number_of_scrolls=10,
        )

        # Process the results
        companies = response.get("result", {}).get("companies", [])
        if not companies:
            print(f"No companies found for batch {batch}")
            return

        # Save or process the companies data
        print(f"Found {len(companies)} companies in batch {batch}")

        for company in companies:
            print(f"Company: {company.get('name', 'N/A')}")
            print(f"Description: {company.get('description', 'N/A')}")
            print(f"Website: {company.get('website', 'N/A')}")
            print("-" * 50)

    except Exception as e:
        print(f"Error scraping batch {batch}: {str(e)}")


async def main():
    # Initialize the async client
    client = AsyncClient(api_key="Your-API-Key")

    try:
        # Example YC batch URLs
        batch_urls = {
            "W24": "https://www.ycombinator.com/companies?batch=Winter%202024",
            "S23": "https://www.ycombinator.com/companies?batch=Summer%202023",
        }

        # Create a scraping task for each batch
        tasks = [
            scrape_companies(client, url, batch)
            for batch, url in batch_urls.items()
        ]

        # Execute all batch scrapes concurrently
        await asyncio.gather(*tasks)

    finally:
        # Ensure the client is properly closed
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())

Lines changed: 46 additions & 0 deletions

from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
from pydantic import BaseModel
from typing import List

sgai_logger.set_logging(level="INFO")


# Define the output schema
class Company(BaseModel):
    name: str
    category: str
    location: str


class CompaniesResponse(BaseModel):
    companies: List[Company]


# Initialize the client with an explicit API key
sgai_client = Client(api_key="sgai-api-key")

try:
    # SmartScraper request with infinite scroll
    response = sgai_client.smartscraper(
        website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
        user_prompt="Extract all company names and their categories from the page",
        output_schema=CompaniesResponse,
        number_of_scrolls=10,  # Scroll 10 times to load more companies
    )

    # Print the response
    print(f"Request ID: {response['request_id']}")

    # Parse and print the results in a structured way
    result = CompaniesResponse.model_validate(response['result'])
    print("\nExtracted Companies:")
    print("-" * 80)
    for company in result.companies:
        print(f"Name: {company.name}")
        print(f"Category: {company.category}")
        print(f"Location: {company.location}")
        print("-" * 80)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    sgai_client.close()

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 5 additions & 0 deletions

@@ -174,6 +174,7 @@ async def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -191,7 +194,9 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
+
         logger.debug("✅ Request validation passed")

         result = await self._make_request(

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 4 additions & 0 deletions

@@ -182,6 +182,7 @@ def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -199,6 +202,7 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")
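
Because number_of_scrolls defaults to None in both clients, existing call sites are untouched; per the field description added in models/smartscraper.py below, None means no scrolling is performed. A minimal sketch of an unchanged call under that assumption (placeholder key and URL):

from scrapegraph_py import Client

client = Client(api_key="sgai-api-key")  # placeholder key
try:
    # Omitting number_of_scrolls leaves the field as None,
    # so the request behaves exactly as it did before this commit.
    response = client.smartscraper(
        website_url="https://example.com",
        user_prompt="Extract the page title",
    )
    print(response["result"])
finally:
    client.close()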

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 6 additions & 1 deletion

@@ -4,7 +4,7 @@
 from uuid import UUID

 from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, model_validator, conint


 class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
         description="Optional headers to send with the request, including cookies and user agent",
     )
     output_schema: Optional[Type[BaseModel]] = None
+    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
+        default=None,
+        description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
+        example=10
+    )

     @model_validator(mode="after")
     def validate_user_prompt(self) -> "SmartScraperRequest":
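
The conint(ge=0, le=100) bound rejects out-of-range values at model construction, before any network call. A minimal sketch of that behavior (assuming website_url and user_prompt are the only other required inputs, and the pydantic v2 API already used by this model):

from pydantic import ValidationError
from scrapegraph_py.models.smartscraper import SmartScraperRequest

# Within the 0-100 bound: the model validates cleanly.
ok = SmartScraperRequest(
    website_url="https://example.com",
    user_prompt="Extract the page title",
    number_of_scrolls=10,
)

# Outside the bound: pydantic raises before any request is sent.
try:
    SmartScraperRequest(
        website_url="https://example.com",
        user_prompt="Extract the page title",
        number_of_scrolls=101,
    )
except ValidationError as exc:
    print(exc)  # reports that the value must be less than or equal to 100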

scrapegraph-py/uv.lock

Lines changed: 1 addition & 1 deletion

Some generated files are not rendered by default.
