
Commit

Merge branch 'main' of https://github.com/chaoss/augur into improve-dockerization
Mbaoma committed Oct 18, 2024
2 parents 604983c + 571aa33 commit 03675fc
Showing 16 changed files with 70 additions and 1,649 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
- # Augur NEW Release v0.76.2
+ # Augur NEW Release v0.76.4

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io).
1 change: 0 additions & 1 deletion add.md

This file was deleted.

2 changes: 1 addition & 1 deletion augur/api/view/api.py
@@ -106,7 +106,7 @@ def av_add_user_repo():
# matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo}
elif Repo.parse_gitlab_repo_url(url)[0]:

- org_name, repo_name = Repo.parse_github_repo_url(url)
+ org_name, repo_name = Repo.parse_gitlab_repo_url(url)
repo_git = f"https://gitlab.com/{org_name}/{repo_name}"

# TODO: gitlab ensure the whole repo git is inserted so it can be found here
3 changes: 2 additions & 1 deletion augur/application/db/data_parse.py
@@ -1151,7 +1151,7 @@ def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, r
return message_ref_dict


- def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_source: str, tool_version: str, data_source: str):
+ def extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str):
"""
Extract specific metadata for a comment from an api response
and connect it to the relevant platform id.
@@ -1169,6 +1169,7 @@ def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_sou
"""

comment_dict = {
+ "repo_id": repo_id,
"pltfrm_id": platform_id,
"msg_text": comment['body'],
"msg_timestamp": comment['created_at'],
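A minimal sketch of the message dict after this change, showing how the new repo_id parameter is threaded into the record (sample values are made up; only the keys visible in the diff are assumed):

comment = {"body": "Looks good to me", "created_at": "2024-10-18T12:00:00Z"}  # hypothetical API payload
repo_id, platform_id = 1, 2  # hypothetical ids passed in by the caller

comment_dict = {
    "repo_id": repo_id,        # newly added so each message can be tied back to its repo
    "pltfrm_id": platform_id,
    "msg_text": comment["body"],
    "msg_timestamp": comment["created_at"],
}
print(comment_dict)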
5 changes: 3 additions & 2 deletions augur/application/db/models/augur_data.py
@@ -26,6 +26,7 @@
import logging
import re
import json
+ import urllib.parse


from augur.application.db.models.base import Base
@@ -971,7 +972,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool:
return False, {"status": "Invalid repo URL"}

# Encode namespace and project name for the API request
- project_identifier = f"{owner}%2F{repo}"
+ project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='')
url = REPO_ENDPOINT.format(project_identifier)

attempts = 0
@@ -1030,7 +1031,7 @@ def parse_gitlab_repo_url(url: str) -> tuple:
Tuple of owner and repo. Or a tuple of None and None if the url is invalid.
"""

- result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url)
+ result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url)

if not result:
return None, None
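Two related fixes land in this file: the parser regex now accepts GitLab subgroups, and the project identifier is built with urllib.parse.quote so every "/" in the namespace is percent-encoded. A self-contained sketch of the new regex behavior (the sample URL is made up):

import re

# Pattern taken from the diff above; the owner group now allows "/" so
# gitlab.com/{group}/{subgroup}/{project} URLs also parse.
pattern = r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$"

url = "https://gitlab.com/gitlab-org/security-products/analyzers"  # hypothetical
owner, repo = re.search(pattern, url).groups()[:2]
print(owner, repo)  # gitlab-org/security-products analyzers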
4 changes: 2 additions & 2 deletions augur/tasks/git/dependency_tasks/core.py
@@ -76,7 +76,7 @@ def generate_scorecard(logger, repo_git):
path = repo_git[8:]
if path[-4:] == '.git':
path = path.replace(".git", "")
- command = '--repo=' + path
+ command = '--local=' + path

#this is path where our scorecard project is located
path_to_scorecard = os.environ['HOME'] + '/scorecard'
@@ -99,7 +99,7 @@ def generate_scorecard(logger, repo_git):
logger.info('adding to database...')
logger.debug(f"output: {required_output}")

- if not required_output['checks']:
+ if not required_output.get('checks'):
logger.info('No scorecard checks found!')
return

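Two behavior notes on this hunk, sketched below: --local= points the scorecard CLI at the locally cloned working copy instead of a remote --repo= URL (assuming the scorecard binary's documented flags), and dict.get keeps the check from raising KeyError when the JSON output has no "checks" key at all:

# Hypothetical scorecard output with no "checks" key (values are made up).
required_output = {"repo": {"name": "example"}, "score": -1}

# The old required_output['checks'] lookup would raise KeyError here.
if not required_output.get('checks'):
    print('No scorecard checks found!')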
2 changes: 1 addition & 1 deletion augur/tasks/git/dependency_tasks/tasks.py
@@ -27,7 +27,7 @@ def process_ossf_dependency_metrics(self, repo_git):
logger.warning(f'Exception generating scorecard: {e}')
tracer = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
logger.warning(f'Full stack trace of OpenSSF scorecard error: {tracer}')
- raise MetadataException(f"An error occurred while generating the scorecard: {str(e)}")
+ raise MetadataException(e,f"An error occurred while generating the scorecard: {str(e)}")

"""
This try/except block is an attempt to get more information about this occasional error:
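A small sketch of what the extra argument changes: the original exception object now travels in the raised exception's args alongside the formatted message (the MetadataException stub below is only a stand-in for Augur's class):

class MetadataException(Exception):
    """Stand-in for Augur's MetadataException; illustration only."""

try:
    raise ValueError("scorecard binary not found")  # hypothetical failure
except Exception as e:
    try:
        raise MetadataException(e, f"An error occurred while generating the scorecard: {str(e)}")
    except MetadataException as me:
        print(repr(me.args[0]))  # the original ValueError instance
        print(me.args[1])        # the human-readable message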
2 changes: 1 addition & 1 deletion augur/tasks/github/events.py
@@ -316,7 +316,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth):
with engine.connect() as connection:

query = text(f"""
- select pull_request_id, pr_src_number as gh_pr_number, pr_src_id from pull_requests order by pr_created_at desc; from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc;
+ select pull_request_id, pr_src_number as gh_pr_number, pr_src_id from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc;
""")

pr_result = connection.execute(query).fetchall()
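A self-contained sketch of what the corrected query returns, using an in-memory SQLite table with a made-up subset of the schema: only pull requests belonging to the repo being collected come back, and the stray duplicated "from pull_requests order by ..." fragment is gone.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE pull_requests (pull_request_id INTEGER, pr_src_number INTEGER, pr_src_id INTEGER, repo_id INTEGER, pr_created_at TEXT)")
conn.executemany("INSERT INTO pull_requests VALUES (?, ?, ?, ?, ?)",
                 [(1, 10, 100, 1, "2024-01-01"), (2, 11, 101, 2, "2024-02-01")])

repo_id = 1  # hypothetical repo being collected
rows = conn.execute(
    f"select pull_request_id, pr_src_number as gh_pr_number, pr_src_id "
    f"from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc").fetchall()
print(rows)  # [(1, 10, 100)] -- pull requests from other repos are no longer returned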
5 changes: 5 additions & 0 deletions augur/tasks/github/util/util.py
@@ -1,6 +1,7 @@
"""Utility functions that are useful for several Github tasks"""
from typing import Any, List, Tuple
import logging
+ import urllib.parse
import json
import httpx
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
@@ -46,6 +47,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]:

return owner, repo

+ def get_gitlab_repo_identifier(owner, repo):
+
+     return urllib.parse.quote(f"{owner}/{repo}", safe='')


def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict:
# try to get json from response
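A usage sketch of the new helper (owner/repo values are made up). Passing safe='' makes quote encode the "/" separator as well, which is the URL-encoded project path GitLab's projects API expects:

import urllib.parse

def get_gitlab_repo_identifier(owner, repo):
    # Percent-encode "owner/repo" into a single URL-safe project identifier.
    return urllib.parse.quote(f"{owner}/{repo}", safe='')

print(get_gitlab_repo_identifier("gnome", "gitlab"))        # gnome%2Fgitlab
print(get_gitlab_repo_identifier("gnome/teams", "design"))  # gnome%2Fteams%2Fdesign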
14 changes: 8 additions & 6 deletions augur/tasks/gitlab/events_task.py
@@ -7,8 +7,8 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data
- from augur.tasks.github.util.util import get_owner_repo
- from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent
+ from augur.tasks.github.util.util import get_gitlab_repo_identifier
+ from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo
from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth

@@ -24,7 +24,7 @@ def collect_gitlab_issue_events(repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

@@ -52,7 +52,7 @@ def collect_gitlab_merge_request_events(repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

@@ -82,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting gitlab issue events for {owner}/{repo}")

- url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}"
+ url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}"
events = GitlabApiHandler(key_auth, logger)

all_data = []
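For reference, a minimal sketch of the events URL these tasks now build once the encoded identifier replaces the hand-written {owner}%2f{repo} (owner, repo, and target type are made up):

import urllib.parse

owner, repo, gtype = "gitlab-org", "gitaly", "Issue"  # hypothetical values
repo_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='')

url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}"
print(url)
# https://gitlab.com/api/v4/projects/gitlab-org%2Fgitaly/events?target_type=Issue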
22 changes: 13 additions & 9 deletions augur/tasks/gitlab/issues_task.py
@@ -8,8 +8,8 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data
- from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts
- from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor
+ from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts
+ from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo
from augur.tasks.util.worker_util import remove_duplicate_dicts
from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth
@@ -32,7 +32,7 @@ def collect_gitlab_issues(repo_git : str) -> int:
key_auth = GitlabRandomKeyAuth(logger)

try:
- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth)

@@ -57,11 +57,13 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting gitlab issues for {owner}/{repo}")

- url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True"
+ url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues?with_labels_details=True"
issues = GitlabApiHandler(key_auth, logger)

all_data = []
@@ -207,7 +209,7 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int:
repo_git: repo url
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issues.__name__)

@@ -237,7 +239,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git):
repo_git: repo url
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

all_comments = {}
issue_count = len(issue_ids)
@@ -249,7 +253,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git):

logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}")

- url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes"
+ url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues/{id}/notes"

for page_data, _ in comments.iter_pages(url):

@@ -315,7 +319,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, session):
}

message_dicts.append(
- extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source)
+ extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source)
)

contributors = remove_duplicate_dicts(contributors)
42 changes: 27 additions & 15 deletions augur/tasks/gitlab/merge_request_task.py
@@ -4,7 +4,7 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data
- from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts
+ from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts
from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth
from augur.tasks.util.worker_util import remove_duplicate_dicts
@@ -26,7 +26,7 @@ def collect_gitlab_merge_requests(repo_git: str) -> int:

repo_id = get_repo_by_repo_git(repo_git).repo_id

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

key_auth = GitlabRandomKeyAuth(logger)

@@ -51,11 +51,13 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting pull requests for {owner}/{repo}")

- url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True"
+ url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests?with_labels_details=True"
mrs = GitlabApiHandler(key_auth, logger)

all_data = []
@@ -171,15 +173,17 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_merge_request_comments.__name__)

+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

- url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}")
+ url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(repo_identifier=repo_identifier, id="{id}")
comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list")

with get_session() as session:
@@ -240,7 +244,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, session):
}

message_dicts.append(
- extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source)
+ extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source)
)

contributors = remove_duplicate_dicts(contributors)
@@ -282,15 +286,17 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_metadata.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

- url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}")
+ url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}".format(repo_identifier=repo_identifier, id="{id}")
metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
@@ -347,15 +353,17 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_reviewers.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

- url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}")
+ url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/approvals".format(repo_identifier=repo_identifier, id="{id}")
reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
@@ -414,15 +422,17 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_commits.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

- url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}")
+ url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/commits".format(repo_identifier=repo_identifier, id="{id}")
commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list")

with get_session() as session:
@@ -484,13 +494,15 @@ def collect_merge_request_files(mr_ids, repo_git) -> int:

logger = logging.getLogger(collect_merge_request_files.__name__)

- owner, repo = get_owner_repo(repo_git)
+ owner, repo = Repo.parse_gitlab_repo_url(repo_git)
+
+ repo_identifier = get_gitlab_repo_identifier(owner, repo)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

- url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}")
+ url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/changes".format(repo_identifier=repo_identifier, id="{id}")
files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
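The merge request endpoints above keep a two-step formatting pattern: str.format fills in repo_identifier immediately but re-emits a literal {id} placeholder, which is presumably filled in per merge request inside retrieve_merge_request_data. A small sketch (identifier and MR id are made up):

repo_identifier = "gitlab-org%2Fgitaly"  # hypothetical encoded project path

# First pass: bake in the project, keep "{id}" as a literal placeholder.
url_template = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(
    repo_identifier=repo_identifier, id="{id}")
print(url_template)
# https://gitlab.com/api/v4/projects/gitlab-org%2Fgitaly/merge_requests/{id}/notes

# Second pass: fill in each merge request id as it is processed.
print(url_template.format(id=42))
# https://gitlab.com/api/v4/projects/gitlab-org%2Fgitaly/merge_requests/42/notes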
10 changes: 3 additions & 7 deletions augur/tasks/init/celery_app.py
@@ -217,9 +217,9 @@ def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(collection_interval, augur_collection_monitor.s())

#Do longer tasks less often
- non_domain_collection_interval = collection_interval * 300
- logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes")
- sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s())
+ logger.info(f"Scheduling data analysis every 30 days")
+ thirty_days_in_seconds = 30*24*60*60
+ sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s())

mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days'))
logger.info(f"Scheduling refresh materialized view every night at 1am CDT")
@@ -231,10 +231,6 @@ def setup_periodic_tasks(sender, **kwargs):
logger.info(f"Setting 404 repos to be marked for retry on midnight each day")
sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s())

- logger.info(f"Scheduling contributor breadth every 30 days")
- thirty_days_in_seconds = 30*24*60*60
- sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s())

@after_setup_logger.connect
def setup_loggers(*args,**kwargs):
"""Override Celery loggers with our own."""
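A minimal, self-contained sketch of the scheduling pattern used above, relying only on standard Celery APIs (the app name, broker, and task body are made-up stand-ins):

from celery import Celery

app = Celery("sketch", broker="memory://")

@app.task
def non_repo_domain_tasks_stub():
    """Stand-in for Augur's non_repo_domain_tasks; illustration only."""
    return "ran"

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # Same pattern as the diff: run the long-running task once every 30 days.
    thirty_days_in_seconds = 30 * 24 * 60 * 60  # 2,592,000 seconds
    sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks_stub.s())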