Skip to content

Commit 09401ed

Browse files
Issue softwarepub#276 - Refactor functions for harvesting CFF/CodeMeta via path
1 parent afb8189 commit 09401ed

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import pathlib
2+
import re
3+
import requests
4+
import tempfile
5+
import typing as t
6+
7+
8+
def normalize_url(path: str) -> str:
    """
    Normalize a URL by converting backslashes and repairing the scheme separator.

    Backslashes are replaced with forward slashes, and any run of slashes that
    follows ``http:`` or ``https:`` is collapsed to exactly two. The function is
    idempotent: an already well-formed URL is returned unchanged.

    :param path: The URL (or URL-like path string) to normalize.
    :return: The normalized URL.
    """
    corrected_url = path.replace("\\", "/")
    # Match one or more slashes so that a correct "https://" is left alone;
    # a plain replace of "https:/" would turn it into "https:///".
    return re.sub(r"(https?):/+", r"\1://", corrected_url)
12+
13+
14+
def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.

    Only the top-level directory listing of the repository is searched. For GitLab,
    only the first two path segments of the URL (``<group-or-user>/<project>``) are
    used to identify the project. Any error is reported via ``print`` and results
    in ``None`` rather than an exception.

    :param repo_url: The repository URL.
    :param filename: The name of the metadata file to fetch.
    :return: Path to the temporary file containing the downloaded metadata, or None
        if the host is unsupported, the file is not found, or an error occurs.
    """
    # requests waits indefinitely by default; bound every call so a stalled
    # server cannot hang the caller.
    request_timeout = 30  # seconds
    git_suffix = ".git"

    try:
        if "github.com" in repo_url:
            # GitHub API
            repo_base = repo_url.rstrip("/")
            # Clone-style URLs end in ".git", which is not part of the API path.
            if repo_base.endswith(git_suffix):
                repo_base = repo_base[: -len(git_suffix)]
            api_url = repo_base.replace("github.com", "api.github.com/repos") + "/contents"
            response = requests.get(api_url, timeout=request_timeout)
            if response.status_code == 200:
                for file_info in response.json():
                    if file_info["name"] == filename:
                        return _download_to_tempfile(file_info["download_url"], filename)
        elif "gitlab.com" in repo_url:
            # GitLab API
            match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
            if match:
                base_domain, group_or_user, project_name = match.groups()
                if project_name.endswith(git_suffix):
                    project_name = project_name[: -len(git_suffix)]
                # The API identifies a project by its URL-encoded "namespace/name" path.
                project_id = requests.utils.quote(f"{group_or_user}/{project_name}", safe="")
                repository_api = f"https://{base_domain}/api/v4/projects/{project_id}/repository"

                # Ask for the largest page so the file is not missed in
                # repositories with many top-level entries.
                response = requests.get(
                    f"{repository_api}/tree",
                    params={"per_page": 100},
                    timeout=request_timeout,
                )
                if response.status_code == 200:
                    for file_info in response.json():
                        if file_info["name"] == filename:
                            file_url = (
                                f"{repository_api}/files/"
                                f"{requests.utils.quote(filename, safe='')}/raw"
                            )
                            return _download_to_tempfile(file_url, filename)
        else:
            print(f"Unsupported repository URL: {repo_url}")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal "not found".
        print(f"Error fetching metadata from repository: {e}")
    return None
57+
58+
59+
def _download_to_tempfile(url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Download a file from a URL and save it to a temporary file.

    The temporary file is created with ``delete=False``, so it persists after this
    function returns. Its suffix is the text after the last dot in ``filename``.
    Any error, including a non-success HTTP status, is reported via ``print``
    and results in ``None``.

    :param url: The URL to download from.
    :param filename: The name of the file to save (used for the suffix and in messages).
    :return: Path to the temporary file, or None if the download failed.
    """
    try:
        response = requests.get(url, timeout=30)
        # Without this check an HTTP error page (e.g. 404) would be stored and
        # returned as if it were the requested metadata.
        response.raise_for_status()

        suffix = f".{filename.split('.')[-1]}"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # Write the raw bytes: decoding via a guessed charset and re-encoding
            # as UTF-8 can corrupt non-ASCII characters.
            temp_file.write(response.content)
        print(f"Downloaded {filename} to {temp_file.name}")
        return pathlib.Path(temp_file.name)
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "download failed".
        print(f"Error downloading {filename}: {e}")
        return None

0 commit comments

Comments
 (0)