Skip to content

Commit 5208b42

Browse files
committed
v0.3.2
Corrected two issues: the --headers-json CLI flag was not parsed correctly, and similar allowed paths were being matched by prefix (e.g. an allowed path of /foo/ would incorrectly also allow /foobar/).
1 parent 314a9ac commit 5208b42

File tree

4 files changed

+23
-6
lines changed

4 files changed

+23
-6
lines changed

src/libcrawler/__main__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def main():
4141
return
4242
elif args.headers_json:
4343
try:
44-
headers = json.loads(args.headers_json)
44+
headers = args.headers_json
4545
except json.JSONDecodeError as e:
4646
print(f"Invalid JSON format for --headers-json: {e}")
4747
return

src/libcrawler/libcrawler.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -71,9 +71,9 @@ def get_links(html, current_url, allowed_paths=None):
7171
current_netloc = urlparse(current_url).netloc
7272
links = set()
7373

74-
# Normalize allowed paths (remove trailing slashes)
74+
# Do not normalize allowed paths (keep trailing slashes)
7575
if allowed_paths:
76-
allowed_paths = [path.rstrip('/') for path in allowed_paths]
76+
allowed_paths = [path.rstrip('/') + '/' if not path.endswith('/') else path for path in allowed_paths]
7777

7878
for a in anchors:
7979
href = a['href']
@@ -87,9 +87,9 @@ def get_links(html, current_url, allowed_paths=None):
8787
# Normalize parsed_url.path (remove trailing slash)
8888
normalized_path = parsed_url.path.rstrip('/')
8989

90-
# If allowed_paths is specified, only include paths that start with allowed_paths
9190
if allowed_paths:
92-
if not any(normalized_path.startswith(path) for path in allowed_paths):
91+
# Check for exact match or if the path starts with allowed path followed by '/'
92+
if not any(normalized_path == path.rstrip('/') or normalized_path.startswith(path.rstrip('/') + '/') for path in allowed_paths):
9393
continue
9494

9595
links.add(full_url)

src/libcrawler/version.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
__version_info__ = ('0', '3', '0')
1+
__version_info__ = ('0', '3', '2')
22
__version__ = '.'.join(__version_info__)

src/tests/test_crawler.py

+17
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,23 @@ def test_get_links_with_index_html(self):
117117
}
118118
self.assertEqual(links, expected_links)
119119

120+
def test_get_links_disallowed_similar_paths(self):
121+
html = '''
122+
<a href="/langgraph/page1">Langgraph Page 1</a>
123+
<a href="/langgraphjs/page2">LanggraphJS Page 2</a>
124+
<a href="/langgraph-tools/page3">Langgraph Tools Page 3</a>
125+
<a href="/langgraph">Langgraph Home</a>
126+
<a href="/langgraphjavascript">Langgraph JavaScript</a>
127+
'''
128+
base_url = 'http://example.com'
129+
allowed_paths = ['/langgraph/']
130+
links = get_links(html, base_url, allowed_paths=allowed_paths)
131+
expected_links = {
132+
'http://example.com/langgraph/page1',
133+
'http://example.com/langgraph', # Include the exact path
134+
}
135+
self.assertEqual(links, expected_links)
136+
120137
class TestIsAllowedByRobots(unittest.TestCase):
121138
def test_is_allowed_by_robots(self):
122139
robots_parser = Mock()

0 commit comments

Comments (0)