Skip to content

Commit 5208b42

Browse files
committed
v0.3.2
Corrected two issues: the --headers-json CLI flag was not parsed correctly, and similar allowed paths were being matched by prefix (e.g. an allowed path of /foo/ would incorrectly also allow /foobar/).
1 parent 314a9ac commit 5208b42

File tree

4 files changed

+23
-6
lines changed

4 files changed

+23
-6
lines changed

src/libcrawler/__main__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def main():
4141
return
4242
elif args.headers_json:
4343
try:
44-
headers = json.loads(args.headers_json)
44+
headers = args.headers_json
4545
except json.JSONDecodeError as e:
4646
print(f"Invalid JSON format for --headers-json: {e}")
4747
return

src/libcrawler/libcrawler.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -71,9 +71,9 @@ def get_links(html, current_url, allowed_paths=None):
7171
current_netloc = urlparse(current_url).netloc
7272
links = set()
7373

74-
# Normalize allowed paths (remove trailing slashes)
74+
# Do not normalize allowed paths (keep trailing slashes)
7575
if allowed_paths:
76-
allowed_paths = [path.rstrip('/') for path in allowed_paths]
76+
allowed_paths = [path.rstrip('/') + '/' if not path.endswith('/') else path for path in allowed_paths]
7777

7878
for a in anchors:
7979
href = a['href']
@@ -87,9 +87,9 @@ def get_links(html, current_url, allowed_paths=None):
8787
# Normalize parsed_url.path (remove trailing slash)
8888
normalized_path = parsed_url.path.rstrip('/')
8989

90-
# If allowed_paths is specified, only include paths that start with allowed_paths
9190
if allowed_paths:
92-
if not any(normalized_path.startswith(path) for path in allowed_paths):
91+
# Check for exact match or if the path starts with allowed path followed by '/'
92+
if not any(normalized_path == path.rstrip('/') or normalized_path.startswith(path.rstrip('/') + '/') for path in allowed_paths):
9393
continue
9494

9595
links.add(full_url)

src/libcrawler/version.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
__version_info__ = ('0', '3', '0')
1+
__version_info__ = ('0', '3', '2')
22
__version__ = '.'.join(__version_info__)

src/tests/test_crawler.py

+17
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,23 @@ def test_get_links_with_index_html(self):
117117
}
118118
self.assertEqual(links, expected_links)
119119

120+
def test_get_links_disallowed_similar_paths(self):
121+
html = '''
122+
<a href="/langgraph/page1">Langgraph Page 1</a>
123+
<a href="/langgraphjs/page2">LanggraphJS Page 2</a>
124+
<a href="/langgraph-tools/page3">Langgraph Tools Page 3</a>
125+
<a href="/langgraph">Langgraph Home</a>
126+
<a href="/langgraphjavascript">Langgraph JavaScript</a>
127+
'''
128+
base_url = 'http://example.com'
129+
allowed_paths = ['/langgraph/']
130+
links = get_links(html, base_url, allowed_paths=allowed_paths)
131+
expected_links = {
132+
'http://example.com/langgraph/page1',
133+
'http://example.com/langgraph', # Include the exact path
134+
}
135+
self.assertEqual(links, expected_links)
136+
120137
class TestIsAllowedByRobots(unittest.TestCase):
121138
def test_is_allowed_by_robots(self):
122139
robots_parser = Mock()

0 commit comments

Comments (0)