Commit ba783f8

Merge pull request #38 from ScrapeGraphAI/add-infinite-scrolling

feat: add infinite scrolling

2 parents 0a7a968 + 6272be9
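
At a glance, the change threads one new optional parameter, number_of_scrolls, from both the sync and async clients down into SmartScraperRequest. A minimal sketch of the new call surface, mirroring the examples added in this commit (the API key is a placeholder):

from scrapegraph_py import Client

client = Client(api_key="sgai-api-key")  # placeholder key
response = client.smartscraper(
    website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
    user_prompt="Extract all company names from the page",
    number_of_scrolls=10,  # scroll the page 10 times before extraction
)
print(response["result"])
client.close()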

File tree

8 files changed: +126 -2 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@
 .DS_Store
 **/.DS_Store
 *.csv
+venv/

__pycache__/test_sdk.cpython-312.pyc

838 Bytes (binary file not shown)

Lines changed: 63 additions & 0 deletions

import asyncio

from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

sgai_logger.set_logging(level="INFO")


async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
    """Scrape companies from a specific YC batch with infinite scroll."""
    try:
        # Initial scrape with infinite scroll enabled
        response = await client.smartscraper(
            website_url=url,
            user_prompt="Extract all company information from this page, including name, description, and website",
            number_of_scrolls=10,
        )

        # Process the results
        companies = response.get("result", {}).get("companies", [])
        if not companies:
            print(f"No companies found for batch {batch}")
            return

        # Save or process the companies data
        print(f"Found {len(companies)} companies in batch {batch}")

        for company in companies:
            print(f"Company: {company.get('name', 'N/A')}")
            print(f"Description: {company.get('description', 'N/A')}")
            print(f"Website: {company.get('website', 'N/A')}")
            print("-" * 50)

    except Exception as e:
        print(f"Error scraping batch {batch}: {str(e)}")


async def main():
    # Initialize the async client
    client = AsyncClient(api_key="Your-API-Key")

    try:
        # Example YC batch URLs
        batch_urls = {
            "W24": "https://www.ycombinator.com/companies?batch=Winter%202024",
            "S23": "https://www.ycombinator.com/companies?batch=Summer%202023",
        }

        # Create a scraping task for each batch
        tasks = [
            scrape_companies(client, url, batch)
            for batch, url in batch_urls.items()
        ]

        # Execute all batch scrapes concurrently
        await asyncio.gather(*tasks)

    finally:
        # Ensure the client is properly closed
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())

Lines changed: 46 additions & 0 deletions

from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
from pydantic import BaseModel
from typing import List

sgai_logger.set_logging(level="INFO")


# Define the output schema
class Company(BaseModel):
    name: str
    category: str
    location: str


class CompaniesResponse(BaseModel):
    companies: List[Company]


# Initialize the client with an explicit API key
sgai_client = Client(api_key="sgai-api-key")

try:
    # SmartScraper request with infinite scroll
    response = sgai_client.smartscraper(
        website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
        user_prompt="Extract all company names and their categories from the page",
        output_schema=CompaniesResponse,
        number_of_scrolls=10,  # Scroll 10 times to load more companies
    )

    # Print the response
    print(f"Request ID: {response['request_id']}")

    # Parse and print the results in a structured way
    result = CompaniesResponse.model_validate(response['result'])
    print("\nExtracted Companies:")
    print("-" * 80)
    for company in result.companies:
        print(f"Name: {company.name}")
        print(f"Category: {company.category}")
        print(f"Location: {company.location}")
        print("-" * 80)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    sgai_client.close()

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 5 additions & 0 deletions

@@ -174,6 +174,7 @@ async def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -191,7 +194,9 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
+
         logger.debug("✅ Request validation passed")

         result = await self._make_request(

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 4 additions & 0 deletions

@@ -182,6 +182,7 @@ def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -199,6 +202,7 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")
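
Because number_of_scrolls defaults to None in both clients, existing call sites are untouched; per the field description added in models/smartscraper.py below, None means no scrolling is performed. A minimal sketch of an unchanged call under that assumption (placeholder key and URL):

from scrapegraph_py import Client

client = Client(api_key="sgai-api-key")  # placeholder key
try:
    # Omitting number_of_scrolls leaves the field as None,
    # so the request behaves exactly as it did before this commit.
    response = client.smartscraper(
        website_url="https://example.com",
        user_prompt="Extract the page title",
    )
    print(response["result"])
finally:
    client.close()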

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 6 additions & 1 deletion

@@ -4,7 +4,7 @@
 from uuid import UUID

 from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, model_validator, conint


 class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
         description="Optional headers to send with the request, including cookies and user agent",
     )
     output_schema: Optional[Type[BaseModel]] = None
+    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
+        default=None,
+        description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
+        example=10
+    )

     @model_validator(mode="after")
     def validate_user_prompt(self) -> "SmartScraperRequest":
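
The conint(ge=0, le=100) bound rejects out-of-range values at model construction, before any network call. A minimal sketch of that behavior (assuming website_url and user_prompt are the only other required inputs, and the pydantic v2 API already used by this model):

from pydantic import ValidationError
from scrapegraph_py.models.smartscraper import SmartScraperRequest

# Within the 0-100 bound: the model validates cleanly.
ok = SmartScraperRequest(
    website_url="https://example.com",
    user_prompt="Extract the page title",
    number_of_scrolls=10,
)

# Outside the bound: pydantic raises before any request is sent.
try:
    SmartScraperRequest(
        website_url="https://example.com",
        user_prompt="Extract the page title",
        number_of_scrolls=101,
    )
except ValidationError as exc:
    print(exc)  # reports that the value must be less than or equal to 100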

scrapegraph-py/uv.lock

Lines changed: 1 addition & 1 deletion

Some generated files are not rendered by default.
