Skip to content

Commit

Permalink
Merge branch 'dev' into unique-repos
Browse files Browse the repository at this point in the history
  • Loading branch information
ABrain7710 committed Oct 8, 2024
2 parents 449917f + 4522248 commit c7774bc
Show file tree
Hide file tree
Showing 15 changed files with 356 additions and 73 deletions.
14 changes: 14 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This CITATION.cff reference content was generated from Zotero.
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: Goggins
given-names: Sean
- family-names: Lumbard
given-names: Kevin
- family-names: Germonprez
given-names: Matt
title: "Open Source Community Health: Analytical Metrics and Their Corresponding Narratives"
doi: 10.1109/SoHeal52568.2021.00010
date-released: 2021
url: https://www.seangoggins.net/wp-content/plugins/zotpress/lib/request/request.dl.php?api_user_id=655145&dlkey=HNG22ZSU&content_type=application/pdf
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Augur NEW Release v0.76.1
# Augur NEW Release v0.76.3

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
Expand All @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.3

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
Expand Down
158 changes: 158 additions & 0 deletions augur/api/metrics/deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,162 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No
return results


@register_metric()
def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None):
    """
    Returns the most recently collected libyear value for each dependency of a
    repo (or of every repo in a repo group). Libyear expresses how out of date
    a dependency is, measured in years.

    :param repo_group_id: The repository group's id
    :param repo_id: The repository's id; when given, results are limited to that repo
    :param period: Accepted for the standard metric signature ('day', 'week', 'month', 'year') but not used by this query
    :param begin_date: Accepted for the standard metric signature but not used by this query; defaults to '1970-1-1 00:00:01'
    :param end_date: Accepted for the standard metric signature but not used by this query; defaults to datetime.now()
    :return: DataFrame with one row per (repo, dependency), carrying repo/group
        metadata, the dependency name, its libyear value, and the timestamp of
        the most recent collection
    """

    # NOTE(review): period/begin_date/end_date are defaulted here but never
    # referenced in the SQL below -- both queries always return only the most
    # recent collection per dependency.
    if not begin_date:
        begin_date = '1970-1-1 00:00:01'
    if not end_date:
        end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    if repo_id:

        # Single-repo variant: subquery d supplies repo/repo_group metadata,
        # subquery C picks each dependency's newest repo_deps_libyear row
        # (max data_collection_date per NAME), and the two are joined on repo_id.
        libyearSQL = s.sql.text("""
            SELECT
                rg_name,
                repo_group_id,
                repo_name,
                d.repo_id,
                repo_git,
                forked_from,
                repo_archived,
                c.name,
                c.libyear,
                MAX ( C.data_collection_date ) AS most_recent_collection
            FROM
                (
                SELECT A.rg_name AS rg_name,
                    A.repo_group_id AS repo_group_id,
                    b.repo_name AS repo_name,
                    b.repo_id AS repo_id,
                    b.repo_git AS repo_git,
                    b.forked_from AS forked_from,
                    b.repo_archived AS repo_archived
                FROM
                    repo_groups A,
                    repo b
                WHERE
                    A.repo_group_id = b.repo_group_id
                ORDER BY
                    rg_name,
                    repo_name
                ) d,
                (
                SELECT DISTINCT
                    f.repo_id,
                    f.NAME,
                    f.libyear,
                    f.data_collection_date
                FROM
                    ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e,
                    augur_data.repo_deps_libyear f
                WHERE
                    e.data_collection_date = f.data_collection_date and
                    e.repo_id = f.repo_id
                ORDER BY
                    NAME
                ) C
            WHERE
                d.repo_id = C.repo_id
                AND C.repo_id = :repo_id
            GROUP BY
                rg_name,
                repo_git,
                repo_group_id,
                repo_name,
                d.repo_id,
                forked_from,
                repo_archived,
                c.name,
                c.libyear
            ORDER BY
                repo_id;
        """)

        with current_app.engine.connect() as conn:
            results = pd.read_sql(libyearSQL, conn, params={'repo_id': repo_id})

    else:

        # Repo-group variant: same inner shape as the single-repo query
        # (wrapped as w), then re-joined to repo/repo_groups to filter on
        # :repo_group_id instead of a single repo.
        libyearSQL = s.sql.text("""
            Select w.* from
            (
            SELECT
                rg_name,
                repo_group_id,
                repo_name,
                d.repo_id,
                repo_git,
                forked_from,
                repo_archived,
                c.name,
                c.libyear,
                MAX ( C.data_collection_date ) AS most_recent_collection
            FROM
                (
                SELECT A.rg_name AS rg_name,
                    A.repo_group_id AS repo_group_id,
                    b.repo_name AS repo_name,
                    b.repo_id AS repo_id,
                    b.repo_git AS repo_git,
                    b.forked_from AS forked_from,
                    b.repo_archived AS repo_archived
                FROM
                    repo_groups A,
                    repo b
                WHERE
                    A.repo_group_id = b.repo_group_id
                ORDER BY
                    rg_name,
                    repo_name
                ) d,
                (
                SELECT DISTINCT
                    f.repo_id,
                    f.NAME,
                    f.libyear,
                    f.data_collection_date
                FROM
                    ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e,
                    augur_data.repo_deps_libyear f
                WHERE
                    e.data_collection_date = f.data_collection_date and
                    e.repo_id = f.repo_id
                ORDER BY
                    NAME
                ) C
            WHERE
                d.repo_id = C.repo_id
            GROUP BY
                rg_name,
                repo_git,
                repo_group_id,
                repo_name,
                d.repo_id,
                forked_from,
                repo_archived,
                c.name,
                c.libyear
            ORDER BY
                repo_id) w,
            repo_groups y,
            repo z
            where w.repo_id=z.repo_id and
            y.repo_group_id=z.repo_group_id
            and z.repo_group_id = :repo_group_id
        """)

        with current_app.engine.connect() as conn:
            results = pd.read_sql(libyearSQL, conn, params={'repo_group_id': repo_group_id})
    return results

2 changes: 1 addition & 1 deletion augur/api/view/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def av_add_user_repo():
# matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo}
elif Repo.parse_gitlab_repo_url(url)[0]:

org_name, repo_name = Repo.parse_github_repo_url(url)
org_name, repo_name = Repo.parse_gitlab_repo_url(url)
repo_git = f"https://gitlab.com/{org_name}/{repo_name}"

gitlab_repo_urls.append(repo_git)
Expand Down
16 changes: 16 additions & 0 deletions augur/application/db/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,22 @@ def get_working_commits_by_repo_id(repo_id):

return working_commits

def get_missing_commit_message_hashes(repo_id):
    """
    Return the distinct commit hashes for a repo that exist in the ``commits``
    table but have no corresponding row in ``commit_messages``.

    :param repo_id: The repository's id
    :return: Rows containing the missing ``cmt_commit_hash`` values, or an
        empty list if the lookup fails
    """

    fetch_missing_hashes_sql = s.sql.text("""
        SELECT DISTINCT cmt_commit_hash FROM commits
        WHERE repo_id=:repo_id
        AND cmt_commit_hash NOT IN
        (SELECT DISTINCT cmt_hash FROM commit_messages WHERE repo_id=:repo_id);
    """).bindparams(repo_id=repo_id)

    try:
        missing_commit_hashes = fetchall_data_from_sql_text(fetch_missing_hashes_sql)
    except Exception:
        # Best-effort lookup: treat a failed query as "nothing missing" so the
        # caller can proceed. The previous bare ``except:`` also swallowed
        # SystemExit/KeyboardInterrupt; catch only real errors.
        missing_commit_hashes = []

    return missing_commit_hashes

def get_worker_oauth_keys(platform: str):

with get_session() as session:
Expand Down
5 changes: 3 additions & 2 deletions augur/application/db/models/augur_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import logging
import re
import json
import urllib.parse


from augur.application.db.models.base import Base
Expand Down Expand Up @@ -972,7 +973,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool:
return False, {"status": "Invalid repo URL"}

# Encode namespace and project name for the API request
project_identifier = f"{owner}%2F{repo}"
project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='')
url = REPO_ENDPOINT.format(project_identifier)

attempts = 0
Expand Down Expand Up @@ -1031,7 +1032,7 @@ def parse_gitlab_repo_url(url: str) -> tuple:
Tuple of owner and repo. Or a tuple of None and None if the url is invalid.
"""

result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url)
result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url)

if not result:
return None, None
Expand Down
49 changes: 48 additions & 1 deletion augur/tasks/git/facade_tasks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#SPDX-License-Identifier: MIT

import logging
import datetime
from celery import group, chain

from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts
from subprocess import check_output
from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts, get_missing_commit_message_hashes

from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set
Expand Down Expand Up @@ -157,6 +159,50 @@ def facade_start_contrib_analysis_task():
facade_helper.update_status('Updating Contributors')
facade_helper.log_activity('Info', 'Updating Contributors with commits')

@celery.task(base=AugurFacadeRepoCollectionTask)
def facade_fetch_missing_commit_messages(repo_git):
    """
    Backfill commit_messages records for commits of a repo that were collected
    without a message, by reading each message from the local git clone.

    Inserts in batches of up to 1000 records via bulk_insert_dicts, keyed on
    (repo_id, cmt_hash) so reruns do not duplicate rows.

    :param repo_git: The repo url string identifying the repository
    """
    logger = logging.getLogger(facade_fetch_missing_commit_messages.__name__)
    facade_helper = FacadeHelper(logger)

    repo = get_repo_by_repo_git(repo_git)

    logger.debug(f"Fetching missing commit message records for repo {repo_git}")

    missing_message_hashes = get_missing_commit_message_hashes(repo.repo_id)

    # The clone location is the same for every hash; compute it once, outside
    # the loop (previously recomputed per iteration).
    absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name)
    repo_loc = (f"{absolute_path}/.git")

    to_insert = []

    # ``commit_hash`` rather than ``hash``: avoid shadowing the builtin.
    for commit_hash in missing_message_hashes:
        try:
            commit_message = check_output(
                f"git --git-dir {repo_loc} log --format=%B -n 1 {commit_hash}".split()
            ).decode('utf-8').strip()

            to_insert.append({
                'repo_id' : repo.repo_id,
                'cmt_msg' : commit_message,
                'cmt_hash' : commit_hash,
                'tool_source' : 'Facade',
                'tool_version' : '0.78?',
                'data_source' : 'git',
                'data_collection_date' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            })

            # Flush a full batch so memory stays bounded on large repos.
            if len(to_insert) >= 1000:
                bulk_insert_dicts(logger, to_insert, CommitMessage, ["repo_id","cmt_hash"])
                to_insert = []
        except Exception as e:
            # Best-effort per hash: a missing/garbled commit shouldn't abort
            # the whole backfill.
            logger.info(f'The exception is : {e}.')

    if to_insert:
        bulk_insert_dicts(logger, to_insert, CommitMessage, ["repo_id","cmt_hash"])


#enable celery multithreading
@celery.task(base=AugurFacadeRepoCollectionTask)
Expand Down Expand Up @@ -354,6 +400,7 @@ def generate_analysis_sequence(logger,repo_git, facade_helper):

analysis_sequence.append(trim_commits_post_analysis_facade_task.si(repo_git))

analysis_sequence.append(facade_fetch_missing_commit_messages.si(repo_git))

analysis_sequence.append(facade_analysis_end_facade_task.si())

Expand Down
5 changes: 5 additions & 0 deletions augur/tasks/github/util/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Utility functions that are useful for several Github tasks"""
from typing import Any, List, Tuple
import logging
import urllib.parse
import json
import httpx
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
Expand Down Expand Up @@ -73,6 +74,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]:

return owner, repo

def get_gitlab_repo_identifier(owner, repo):
    """Return the URL-encoded ``owner/repo`` path GitLab's API uses as a project identifier."""
    raw_path = "/".join((owner, repo))
    # safe='' so the separating '/' is encoded as %2F, as the API requires.
    return urllib.parse.quote(raw_path, safe='')


def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict:
# try to get json from response
Expand Down
14 changes: 8 additions & 6 deletions augur/tasks/gitlab/events_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data
from augur.tasks.github.util.util import get_owner_repo
from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent
from augur.tasks.github.util.util import get_gitlab_repo_identifier
from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo
from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth

Expand All @@ -24,7 +24,7 @@ def collect_gitlab_issue_events(repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

Expand Down Expand Up @@ -52,7 +52,7 @@ def collect_gitlab_merge_request_events(repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

Expand Down Expand Up @@ -82,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting gitlab issue events for {owner}/{repo}")

url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}"
url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}"
events = GitlabApiHandler(key_auth, logger)

all_data = []
Expand Down
Loading

0 comments on commit c7774bc

Please sign in to comment.