diff --git a/.docker-setup.sh b/.docker-setup.sh index d9c9e7f8c..dfdc797a3 100755 --- a/.docker-setup.sh +++ b/.docker-setup.sh @@ -24,7 +24,6 @@ missingModules="" #Check everything that needs to be in the $PATH is in there. #Bash doesn't let this work if this is in an if statement for some reason it has to be chained type -P "docker" &>/dev/null && echo "docker found..." || missingModules="${missingModules} docker" -type -P "docker-compose" &>/dev/null && echo "docker-compose found..." || missingModules="${missingModules} docker-compose" type -P "ifconfig" &>/dev/null && echo "ifconfig found..." || missingModules="${missingModules} ifconfig (part of net-tools)" type -P "psql" &>/dev/null && echo "psql found..." || missingModules="${missingModules} psql" type -P "watch" &>/dev/null && echo "watch found..." || missingModules="${missingModules} watch" diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 000000000..c23bfd7bb --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,31 @@ +name: "run-linting-checks" +on: + pull_request: + branches: [main, dev] + +jobs: + run-pylint: + name: runner / pylint + permissions: write-all + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: dciborow/action-pylint@0.1.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-pr-review + level: warning + glob_pattern: "**/*.py" + filter_mode: "file" + + misspell: + name: runner / misspell + runs-on: ubuntu-latest + steps: + - name: Highlight any misspellings in changes. + uses: actions/checkout@v4 + - name: misspell + uses: reviewdog/action-misspell@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + locale: "US" \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 0b1b7d204..0056af873 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ #refactoring checker #enable=R -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311 +disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401 # Analyse import fallback blocks. This can be used to support both Python 2 and diff --git a/Makefile b/Makefile index 26cac178b..22364ac16 100644 --- a/Makefile +++ b/Makefile @@ -139,16 +139,16 @@ docs-view: docs compose-run: - @ docker-compose -f docker-compose.yml up --build + @ docker compose -f docker-compose.yml up --build compose-run-database: @ echo "**************************************************************************" @ echo "Make sure there are no database credentials in docker_env.txt!" @ echo "**************************************************************************" @ echo - @ docker-compose -f docker-compose.yml -f database-compose.yml up --build + @ docker compose -f docker-compose.yml -f database-compose.yml up --build -docker-build: docker-build-backend docker-build-frontend docker-build-database +docker-build: docker-build-backend docker-build-frontend docker-build-database docker-build-rabbitmq docker-build-backend: @ docker build -t augurlabs/augur:backend -f util/docker/backend/Dockerfile . @@ -159,6 +159,8 @@ docker-build-frontend: docker-build-database: @ docker build -t augurlabs/augur:database -f util/docker/database/Dockerfile . +docker-build-rabbitmq: + @ docker build -t augurlabs/augur:rabbitmq -f util/docker/rabbitmq/Dockerfile . 
docker-run-backend: @ - docker stop augur_backend @@ -174,3 +176,8 @@ docker-run-database: @ - docker stop augur_database @ - docker rm augur_database docker run -p 5434:5432 --name augur_database augurlabs/augur:database + +docker-run-rabbitmq: + @ - docker stop augur_rabbitmq + @ - docker rm augur_rabbitmq + docker run -p 5672:5672 --name augur_rabbitmq augurlabs/augur:rabbitmq \ No newline at end of file diff --git a/README.md b/README.md index 9977fc81a..13fbe0dca 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# Augur NEW Release v0.60.0 +# Augur NEW Release v0.62.4 + +Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! +The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io [![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. [You can find these issues tagged with "first timers only" on our issues list.](https://github.com/chaoss/augur/labels/first-timers-only). @@ -7,7 +10,7 @@ ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.60.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.62.4 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. - A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard @@ -94,6 +97,7 @@ Contributors - `Dawn Foster `_ - `Ivana Atanasova `_ - `Georg J.P. Link `_ +- `Gary P White `_ GSoC 2022 participants ----------------------- diff --git a/augur/api/metrics/README.md b/augur/api/metrics/README.md index cabcc4475..5990291bf 100644 --- a/augur/api/metrics/README.md +++ b/augur/api/metrics/README.md @@ -26,7 +26,8 @@ from augur.application.db.engine import engine 4. Define any queries with the structure shown below ```py repo_sql = s.sql.text(""" SELECT repo.repo_name FROM repo WHERE repo.repo_id = :repo_id """) -results = pd.read_sql(repo_sql, engine, params={'repo_id': repo_id}) +with engine.connect() as conn: + results = pd.read_sql(repo_sql, conn, params={'repo_id': repo_id}) ``` 5. Return either a pandas dataframe, dict, or json.
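The connection-scoped query pattern introduced in the metrics README above is the change repeated across every metrics module in this diff. The following is a minimal, self-contained sketch of that pattern, not Augur's actual module: it substitutes an in-memory SQLite engine for the shared `engine` the metrics modules import via `from ..server import engine`, and the table, rows, and `repo_name` function are illustrative only.

```py
import pandas as pd
import sqlalchemy as s

# Stand-in for Augur's shared engine (the metrics modules import it with
# `from ..server import engine`); an in-memory SQLite database keeps this
# sketch self-contained and runnable.
engine = s.create_engine("sqlite:///:memory:")

# Illustrative schema and rows only.
with engine.begin() as conn:
    conn.execute(s.sql.text("CREATE TABLE repo (repo_id INTEGER, repo_name TEXT)"))
    conn.execute(s.sql.text("INSERT INTO repo VALUES (1, 'augur'), (2, 'example-repo')"))

def repo_name(repo_id):
    """Metric-style query using the connection-scoped read_sql pattern."""
    repo_sql = s.sql.text("""
        SELECT repo.repo_name FROM repo WHERE repo.repo_id = :repo_id
    """)

    # Borrow a connection only for the duration of the read instead of handing
    # the Engine object to pandas; it is returned to the pool when the block exits.
    with engine.connect() as conn:
        results = pd.read_sql(repo_sql, conn, params={'repo_id': repo_id})
    return results

print(repo_name(1))
```

Scoping each read to an explicit connection, rather than passing the `Engine` itself, guarantees the connection is released when the block exits and matches what newer pandas and SQLAlchemy releases expect, which appears to be the motivation for the repeated `pd.read_sql(..., engine, ...)` to `with engine.connect() as conn:` conversion throughout the files below.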
- Note: If you return a pandas dataframe or dict it will be automatically converted into json diff --git a/augur/api/metrics/commit.py b/augur/api/metrics/commit.py index c143cd9f6..41d86abbf 100644 --- a/augur/api/metrics/commit.py +++ b/augur/api/metrics/commit.py @@ -90,8 +90,9 @@ def committers(repo_group_id, repo_id=None, begin_date=None, end_date=None, peri """ ) - results = pd.read_sql(committersSQL, engine, params={'repo_id': repo_id, - 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date, 'period':period}) + with engine.connect() as conn: + results = pd.read_sql(committersSQL, conn, params={'repo_id': repo_id, + 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date, 'period':period}) return results @@ -167,8 +168,9 @@ def annual_commit_count_ranked_by_new_repo_in_repo_group(repo_group_id, repo_id= ORDER BY YEAR ASC """.format(table, period)) - results = pd.read_sql(cdRgNewrepRankedCommitsSQL, engine, params={'repo_id': repo_id, - 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={'repo_id': repo_id, + 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -265,8 +267,9 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None LIMIT 10 """) - results = pd.read_sql(cdRgTpRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id}) return results @register_metric() @@ -296,8 +299,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - results = pd.read_sql(total_commits_SQL, engine, - params={'year': year, 'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(total_commits_SQL, conn, + params={'year': year, 'repo_group_id': repo_group_id}) else: total_commits_SQL = s.sql.text(""" SELECT SUM(patches)::int @@ -308,8 +312,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - results = pd.read_sql(total_commits_SQL, engine, - params={'year': year, 'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(total_commits_SQL, conn, + params={'year': year, 'repo_id': repo_id}) if not results.iloc[0]['sum']: return pd.DataFrame() @@ -334,8 +339,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - results = pd.read_sql(committers_SQL, engine, - params={'year': year, 'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(committers_SQL, conn, + params={'year': year, 'repo_group_id': repo_group_id}) else: committers_SQL = s.sql.text(""" SELECT @@ -353,8 +359,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - results = pd.read_sql(committers_SQL, engine, - params={'year': year, 'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(committers_SQL, conn, + params={'year': year, 'repo_id': repo_id}) cumsum = 0 for i, row in results.iterrows(): diff --git a/augur/api/metrics/contributor.py b/augur/api/metrics/contributor.py index 7d255ecb4..3f25236d0 100644 --- a/augur/api/metrics/contributor.py +++ 
b/augur/api/metrics/contributor.py @@ -125,8 +125,9 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsSQL = s.sql.text(""" SELECT id::text AS user_id, @@ -211,8 +212,9 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -281,8 +283,9 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - results = pd.read_sql(contributorsNewSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsNewSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsNewSQL = s.sql.text(""" SELECT date_trunc(:period, b.created_at::DATE) AS date, COUNT(id) AS new_contributors, repo.repo_id, repo_name @@ -330,8 +333,9 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - results = pd.read_sql(contributorsNewSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsNewSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -351,7 +355,8 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY commits.repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email, repo_name ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - results = pd.read_sql(linesChangedByAuthorSQL, engine, params={"repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_id": repo_id}) return results else: linesChangedByAuthorSQL = s.sql.text(""" @@ -362,7 +367,8 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - results = pd.read_sql(linesChangedByAuthorSQL, engine, params={"repo_group_id": repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_group_id": repo_group_id}) return results @register_metric() @@ -420,8 +426,9 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg GROUP BY a.email, a.repo_id, repo_name """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with 
engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsSQL = s.sql.text(""" SELECT @@ -455,6 +462,7 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg ORDER BY commits desc, email """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/deps.py b/augur/api/metrics/deps.py index deb5ac89f..d92371d89 100644 --- a/augur/api/metrics/deps.py +++ b/augur/api/metrics/deps.py @@ -6,6 +6,7 @@ import sqlalchemy as s import pandas as pd from augur.api.util import register_metric +import datetime from ..server import engine @@ -45,7 +46,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo_dependencies.repo_id = :repo_id """) - results = pd.read_sql(depsSQL, engine) + with engine.connect() as conn: + results = pd.read_sql(depsSQL, conn) else: @@ -69,7 +71,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo.repo_group_id = :repo_group_id """) - results = pd.read_sql(depsSQL, engine) + with engine.connect() as conn: + results = pd.read_sql(depsSQL, conn) return results diff --git a/augur/api/metrics/insight.py b/augur/api/metrics/insight.py index 874f656f7..848161e1a 100644 --- a/augur/api/metrics/insight.py +++ b/augur/api/metrics/insight.py @@ -29,5 +29,6 @@ def top_insights(repo_group_id, num_repos=6): LIMIT :num_repos ) """) - results = pd.read_sql(topInsightsSQL, engine, params={'repo_group_id': repo_group_id, 'num_repos': num_repos}) + with engine.connect() as conn: + results = pd.read_sql(topInsightsSQL, conn, params={'repo_group_id': repo_group_id, 'num_repos': num_repos}) return results diff --git a/augur/api/metrics/issue.py b/augur/api/metrics/issue.py index 72108bc20..22ee2630b 100644 --- a/augur/api/metrics/issue.py +++ b/augur/api/metrics/issue.py @@ -50,8 +50,10 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da GROUP BY issue_date, repo_name ORDER BY issue_date """) - results = pd.read_sql(issueNewContributor, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(issueNewContributor, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issueNewContributor = s.sql.text(""" SELECT @@ -76,9 +78,10 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da GROUP BY repo.repo_id, issue_date ORDER BY issue_date """) - results = pd.read_sql(issueNewContributor, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issueNewContributor, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -119,8 +122,9 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da ) AS iss_close GROUP BY issue_date, repo_name """) - results = pd.read_sql(issuesClosedSQL, 
engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issuesClosedSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issuesClosedSQL = s.sql.text(""" SELECT date_trunc(:period, new_date::DATE) AS issue_date, @@ -141,8 +145,10 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da ) AS iss_close GROUP BY repo_id, repo_name,issue_date """) - results = pd.read_sql(issuesClosedSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(issuesClosedSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -179,8 +185,9 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_new_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_new_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -198,8 +205,9 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY date; """) - results = pd.read_sql(issues_new_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_new_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -235,8 +243,9 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_active_SQL, engine, params={'repo_group_id': repo_group_id, 'period':period, - 'begin_date': begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_active_SQL, conn, params={'repo_group_id': repo_group_id, 'period':period, + 'begin_date': begin_date, 'end_date':end_date}) else: issues_active_SQL = s.sql.text(""" @@ -254,8 +263,9 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date """) - results = pd.read_sql(issues_active_SQL, engine, params={'repo_id': repo_id, 'period':period, - 'begin_date': begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_active_SQL, conn, params={'repo_id': repo_id, 'period':period, + 'begin_date': begin_date, 'end_date':end_date}) return results @register_metric() @@ -290,8 +300,9 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_closed_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_closed_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issues_closed_SQL = s.sql.text(""" @@ -308,8 +319,9 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date; """) - results = 
pd.read_sql(issues_closed_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_closed_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -347,9 +359,10 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY repo_id, issue_id """) - results = pd.read_sql(issue_duration_SQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issue_duration_SQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -371,9 +384,10 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY issue_id; """) - results = pd.read_sql(issue_duration_SQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issue_duration_SQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -417,9 +431,10 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.repo_id, issues.created_at """) - result = pd.read_sql(issue_participants_SQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + result = pd.read_sql(issue_participants_SQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) return result else: issue_participants_SQL = s.sql.text(""" @@ -445,9 +460,10 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.created_at """) - result = pd.read_sql(issue_participants_SQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + result = pd.read_sql(issue_participants_SQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) return result @register_metric() @@ -468,7 +484,9 @@ def issue_backlog(repo_group_id, repo_id=None): GROUP BY issues.repo_id, repo_name ORDER BY issues.repo_id """) - result = pd.read_sql(issue_backlog_SQL, engine, params={'repo_group_id': repo_group_id}) + + with engine.connect() as conn: + result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_group_id': repo_group_id}) return result else: @@ -481,7 +499,8 @@ def issue_backlog(repo_group_id, repo_id=None): GROUP BY repo_name """) - result = pd.read_sql(issue_backlog_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_id': repo_id}) return result @register_metric() @@ -509,7 +528,8 @@ def issue_throughput(repo_group_id, repo_id=None): AND table1.repo_id = repo.repo_id """) - results = pd.read_sql(issue_throughput_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_throughput_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -525,7 +545,8 @@ def issue_throughput(repo_group_id, repo_id=None): WHERE table1.repo_id = repo.repo_id """) - result = pd.read_sql(issue_throughput_SQL, engine, 
params={'repo_id': repo_id}) + with engine.connect() as conn: + result = pd.read_sql(issue_throughput_SQL, conn, params={'repo_id': repo_id}) return result @register_metric() @@ -574,9 +595,10 @@ def issues_open_age(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY open_date DESC """) - results = pd.read_sql(openAgeSQL, engine, - params={'repo_id': repo_id, 'repo_group_id': repo_group_id, - 'period': period, 'begin_date':begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(openAgeSQL, conn, + params={'repo_id': repo_id, 'repo_group_id': repo_group_id, + 'period': period, 'begin_date':begin_date, 'end_date':end_date}) return results @@ -634,11 +656,12 @@ def issues_closed_resolution_duration(repo_group_id, repo_id=None, period='day', ORDER BY gh_issue_number """) - results = pd.read_sql(issueSQL, engine, - params={'repo_id': repo_id, - 'repo_group_id': repo_group_id, - 'period': period, 'begin_date':begin_date, - 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issueSQL, conn, + params={'repo_id': repo_id, + 'repo_group_id': repo_group_id, + 'period': period, 'begin_date':begin_date, + 'end_date':end_date}) return results @@ -667,8 +690,9 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): """) - results = pd.read_sql(avg_issue_resolution_SQL, engine, - params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(avg_issue_resolution_SQL, conn, + params={'repo_group_id': repo_group_id}) return results else: @@ -683,8 +707,9 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): GROUP BY repo.repo_name """) - results = pd.read_sql(avg_issue_resolution_SQL, engine, - params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(avg_issue_resolution_SQL, conn, + params={'repo_id': repo_id}) return results @register_metric() @@ -757,7 +782,8 @@ def issues_maintainer_response_duration(repo_group_id, repo_id=None, begin_date= group by repo_id, repo_name """) - results = pd.read_sql(issuesSQL, engine, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issuesSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @@ -780,7 +806,8 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - results = pd.read_sql(openIssueCountSQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(openIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: openIssueCountSQL = s.sql.text(""" @@ -794,7 +821,8 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - results = pd.read_sql(openIssueCountSQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(openIssueCountSQL, conn, params={'repo_id': repo_id}) return results @@ -817,7 +845,8 @@ def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - results = pd.read_sql(closedIssueCountSQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: closedIssueCountSQL = s.sql.text(""" @@ -831,7 +860,8 @@ 
def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - results = pd.read_sql(closedIssueCountSQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -893,8 +923,9 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_group_id': repo_group_id}) return results else: @@ -946,8 +977,9 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_id': repo_id}) return results @register_metric() @@ -978,9 +1010,10 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): """) - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_group_id': repo_group_id, - 'group_by': group_by}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_group_id': repo_group_id, + 'group_by': group_by}) return results else: @@ -1006,8 +1039,9 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): ORDER BY date """) - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_id': repo_id, 'group_by': group_by}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_id': repo_id, 'group_by': group_by}) return results @register_metric() @@ -1057,6 +1091,7 @@ def abandoned_issues(repo_group_id, repo_id=None, period='day', begin_date=None, ''' ) - results = pd.read_sql(abandonedSQL, engine, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(abandonedSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/message.py b/augur/api/metrics/message.py index 8c36c3a4c..9988f5a0d 100644 --- a/augur/api/metrics/message.py +++ b/augur/api/metrics/message.py @@ -56,9 +56,9 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en """) - - results = pd.read_sql(repomessagesSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(repomessagesSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: repomessagesSQL = s.sql.text(""" @@ -85,10 +85,11 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en rg_name, message_date """) - - results = pd.read_sql(repomessagesSQL, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(repomessagesSQL, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': 
begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/pull_request.py b/augur/api/metrics/pull_request.py index 9fbcc6175..3b1798ec0 100644 --- a/augur/api/metrics/pull_request.py +++ b/augur/api/metrics/pull_request.py @@ -10,6 +10,53 @@ from ..server import engine +@register_metric() +def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): + """ + Returns a time series of the number of new Pull Requests opened during a certain period. + + :param repo_id: The repository's id + :param repo_group_id: The repository's group id + :param period: To set the periodicity to 'day', 'week', 'month' or 'year', defaults to 'day' + :param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:01' + :param end_date: Specifies the end date, defaults to datetime.now() + :return: DataFrame of new Pull Requests/period + """ + if not begin_date: + begin_date = '1970-1-1 00:00:01' + if not end_date: + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + if repo_id: + new_pull_requests_query = s.sql.text(""" + SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, + COUNT(pr_id) AS new_pull_requests + FROM pull_requests + WHERE repo_id = :repo_id + AND pr_created_at BETWEEN :begin_date AND :end_date + GROUP BY created_date + """) + + results = pd.read_sql(new_pull_requests_query, engine, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) + else: + new_pull_requests_query = s.sql.text(""" + SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, + COUNT(pr_id) AS new_pull_requests + FROM pull_requests + WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) + AND pr_created_at BETWEEN :begin_date AND :end_date + GROUP BY created_date + """) + + results = pd.read_sql(new_pull_requests_query, engine, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) + + return results + @register_metric() def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): """ @@ -40,9 +87,10 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day """) - results = pd.read_sql(commitNewContributor, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(commitNewContributor, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) else: commitNewContributor = s.sql.text(""" SELECT abc.repo_id, repo_name ,date_trunc(:period, new_date::DATE) as commit_date, @@ -58,11 +106,11 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day GROUP BY abc.repo_id, repo_name, commit_date """) - - results = pd.read_sql(commitNewContributor, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(commitNewContributor, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) return results @register_metric() @@ -96,9 +144,10 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg - results = pd.read_sql(closedNoMerge, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + 
results = pd.read_sql(closedNoMerge, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) else: closedNoMerge = s.sql.text(""" @@ -110,11 +159,11 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg ORDER BY closed_date """) - - results = pd.read_sql(closedNoMerge, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(closedNoMerge, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) return results @register_metric() @@ -151,9 +200,10 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date """) - results = pd.read_sql(reviews_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(reviews_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -171,10 +221,10 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date ORDER BY date """) - - results = pd.read_sql(reviews_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -211,10 +261,10 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - - results = pd.read_sql(reviews_accepted_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_accepted_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) return results else: reviews_accepted_SQL = s.sql.text(""" @@ -232,9 +282,10 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - results = pd.read_sql(reviews_accepted_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_accepted_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -271,10 +322,10 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - - results = pd.read_sql(reviews_declined_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(reviews_declined_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: reviews_declined_SQL = s.sql.text(""" @@ -292,9 +343,10 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - results = pd.read_sql(reviews_declined_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with 
engine.connect() as conn: + results = pd.read_sql(reviews_declined_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -331,11 +383,11 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_requests.pull_request_id """) - - results = pd.read_sql(review_duration_SQL, engine, - params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(review_duration_SQL, conn, + params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results else: @@ -355,10 +407,11 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_request_id """) - results = pd.read_sql(review_duration_SQL, engine, - params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(review_duration_SQL, conn, + params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -408,8 +461,9 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - results = pd.read_sql(prAccRateSQL, engine, params={'repo_group_id': repo_group_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(prAccRateSQL, conn, params={'repo_group_id': repo_group_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results else: prAccRateSQL = s.sql.text(""" @@ -441,8 +495,9 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - results = pd.read_sql(prAccRateSQL, engine, params={'repo_id': repo_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(prAccRateSQL, conn, params={'repo_id': repo_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -546,9 +601,10 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_time_to_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_close'.format(time_unit)]] else: @@ -657,10 +713,11 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 GROUP BY closed_year, closed_month, merged_status, time_between_responses.pr_closed_at, time_between_responses.average_time_between_responses """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with 
engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_time_between_responses = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_between_responses'.format(time_unit)]] else: @@ -767,10 +824,11 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo GROUP BY closed_year, merged_status, data.pr_closed_at, data.commit_count """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_commit_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_commits_per_pull_request']] else: @@ -926,10 +984,11 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon ORDER BY merged_status, closed_year, closed_week, closed_day """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) count_names = ['assigned_count', 'review_requested_count', 'labeled_count', 'unlabeled_count', 'subscribed_count', 'mentioned_count', 'referenced_count', 'closed_count', 'head_ref_force_pushed_count', 'head_ref_deleted_count', 'milestoned_count', 'merged_count', 'comment_count'] average_count_names = [] @@ -1050,9 +1109,10 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None GROUP BY closed_year, merged_status, response_times.first_response_time, response_times.last_response_time, response_times.pr_created_at, response_times.pr_closed_at """) - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: avg_pr_time_to_responses_and_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_first_response'.format(time_unit), 'average_{}_to_last_response'.format(time_unit), 'average_{}_to_close'.format(time_unit)]] @@ -1132,9 +1192,10 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 AND pr_closed_at::date <= :end_date ::date """) - - pr_all = pd.read_sql(pr_all_sql, engine, params={'repo_group_id': repo_group_id, - 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: 
+ pr_all = pd.read_sql(pr_all_sql, conn, params={'repo_group_id': repo_group_id, + 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_merged_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).count().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['pull_request_count']] diff --git a/augur/api/metrics/release.py b/augur/api/metrics/release.py index 60f779365..5594f7ef0 100644 --- a/augur/api/metrics/release.py +++ b/augur/api/metrics/release.py @@ -50,10 +50,10 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -80,10 +80,10 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -127,10 +127,10 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -150,10 +150,11 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results -def create_release_metrics(metrics): - add_metrics(metrics, __name__) +#def create_release_metrics(metrics): +# add_metrics(metrics, __name__) diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index ca4d9668e..c5d8e1138 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -46,8 +46,8 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - - results = pd.read_sql(code_changes_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, + with engine.connect() as conn: + results = pd.read_sql(code_changes_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) results['week'] = results['week'].apply(lambda x: x - 1) results['date'] = results['year'].astype(str) + ' ' + results['week'].astype(str) + ' 0' @@ -68,9 
+68,9 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - - results = pd.read_sql(code_changes_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) results['week'] = results['week'].apply(lambda x: x - 1) results['date'] = results['year'].astype(str) + ' ' + results['week'].astype(str) + ' 0' @@ -111,8 +111,9 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY commits.repo_id, date """) - results = pd.read_sql(code_changes_lines_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -130,9 +131,9 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY date; """) - - results = pd.read_sql(code_changes_lines_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -163,8 +164,9 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - results = pd.read_sql(sub_projectsSQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(sub_projectsSQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) else: sub_projectsSQL = s.sql.text(""" SELECT COUNT(*) AS sub_project_count @@ -173,8 +175,9 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - results = pd.read_sql(sub_projectsSQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(sub_projectsSQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -194,8 +197,8 @@ def sbom_download(repo_group_id, repo_id=None): logger.debug(dosocs_SQL) params = {'repo_id': repo_id} - - return pd.read_sql(dosocs_SQL, engine, params=params) + with engine.connect() as conn: + return pd.read_sql(dosocs_SQL, conn, params=params) #return [json.dumps(license_information)] @register_metric() @@ -223,7 +226,8 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): LIMIT 1 """) - raw_df = pd.read_sql(cii_best_practices_badge_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + raw_df = pd.read_sql(cii_best_practices_badge_SQL, conn, params={'repo_id': repo_id}) if len(raw_df) == 0: return [] @@ -263,8 +267,8 @@ def forks(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(forks_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(forks_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ 
-278,8 +282,8 @@ def forks(repo_group_id, repo_id=None): ORDER BY date """) - - results = pd.read_sql(forks_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(forks_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -303,8 +307,8 @@ def fork_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(fork_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(fork_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: fork_count_SQL = s.sql.text(""" @@ -315,8 +319,8 @@ def fork_count(repo_group_id, repo_id=None): LIMIT 1 """) - - results = pd.read_sql(fork_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(fork_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -334,7 +338,8 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) """) - results = pd.read_sql(languages_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(languages_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -344,8 +349,8 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id = :repo_id """) - - results = pd.read_sql(languages_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(languages_SQL, conn, params={'repo_id': repo_id}) return results @register_metric(type="license") @@ -381,7 +386,8 @@ def license_files(license_id, spdx_binary, repo_group_id, repo_id=None,): b.license_id in ( 369,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482)); """) - results = pd.read_sql(license_data_SQL, engine, params={'repo_id': repo_id, 'spdx_binary': spdx_binary, 'license_id': license_id}) + with engine.connect() as conn: + results = pd.read_sql(license_data_SQL, conn, params={'repo_id': repo_id, 'spdx_binary': spdx_binary, 'license_id': license_id}) return results @register_metric() @@ -450,7 +456,8 @@ def license_declared(repo_group_id, repo_id=None): short_name; """) - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -534,7 +541,8 @@ def license_coverage(repo_group_id, repo_id=None): GROUP BY a.name, a.licensed, a.licensed, b.total """) - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -595,8 +603,8 @@ def license_count(repo_group_id, repo_id=None): GROUP BY 
a.name, a.number_of_license, a.licensed, b.total """) - - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -624,8 +632,8 @@ def stars(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(stars_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -639,7 +647,8 @@ def stars(repo_group_id, repo_id=None): ORDER BY date """) - results = pd.read_sql(stars_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -663,8 +672,8 @@ def stars_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(stars_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: stars_count_SQL = s.sql.text(""" @@ -675,7 +684,8 @@ def stars_count(repo_group_id, repo_id=None): LIMIT 1 """) - results = pd.read_sql(stars_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -701,8 +711,8 @@ def watchers(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(watchers_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -716,8 +726,8 @@ def watchers(repo_group_id, repo_id=None): ORDER BY date """) - - results = pd.read_sql(watchers_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -741,8 +751,8 @@ def watchers_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(watchers_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: watchers_count_SQL = s.sql.text(""" @@ -753,8 +763,8 @@ def watchers_count(repo_group_id, repo_id=None): LIMIT 1 """) - - results = pd.read_sql(watchers_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -798,8 +808,9 @@ def annual_lines_of_code_count_ranked_by_new_repo_in_repo_group(repo_group_id, r LIMIT 10 """) - results = pd.read_sql(cdRgNewrepRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id, "calendar_year": calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id, "calendar_year": calendar_year}) return results @register_metric() @@ -894,9 +905,9 @@ def annual_lines_of_code_count_ranked_by_repo_in_repo_group(repo_group_id, repo_ """) - - results = 
pd.read_sql(cdRgTpRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id}) return results @register_metric() @@ -948,8 +959,8 @@ def lines_of_code_commit_counts_by_calendar_year_grouped(repo_url, calendar_year GROUP BY week """) - - results = pd.read_sql(cdRepTpIntervalLocCommitsSQL, engine, params={"repourl": '%{}%'.format(repo_url), 'calendar_year': calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(cdRepTpIntervalLocCommitsSQL, conn, params={"repourl": '%{}%'.format(repo_url), 'calendar_year': calendar_year}) return results @register_metric() @@ -969,9 +980,9 @@ def average_weekly_commits(repo_group_id=None, repo_id=None, calendar_year=None) ORDER BY repo_name """.format(extra_and)) - - results = pd.read_sql(average_weekly_commits_sql, engine, params={"repo_group_id": repo_group_id, - "repo_id": repo_id, "calendar_year": calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(average_weekly_commits_sql, conn, params={"repo_group_id": repo_group_id, + "repo_id": repo_id, "calendar_year": calendar_year}) return results @register_metric() @@ -1054,8 +1065,9 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - results = pd.read_sql(summarySQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(summarySQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) return results else: summarySQL = s.sql.text(""" @@ -1123,6 +1135,7 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - results = pd.read_sql(summarySQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(summarySQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/toss.py b/augur/api/metrics/toss.py index 122cb3567..d3e91ad40 100644 --- a/augur/api/metrics/toss.py +++ b/augur/api/metrics/toss.py @@ -57,8 +57,9 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g ) opened ON merged.repo_id = opened.repo_id """) - results = pd.read_sql(pr_acceptance_rate_sql, engine, params={'repo_id': repo_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -89,8 +90,9 @@ def toss_review_duration(repo_id, begin_date=None, end_date=None): AND :end_date """) - results = pd.read_sql(pr_acceptance_rate_sql, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) if results.iloc[0]['duration'] is None: results.iloc[0]['duration'] = -1 else: @@ -120,5 +122,6 @@ def toss_repo_info(repo_id): LIMIT 1; """) - results = pd.read_sql(license_file_sql, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(license_file_sql, conn, 
params={'repo_id': repo_id}) return results diff --git a/augur/api/routes/__init__.py b/augur/api/routes/__init__.py index 5e601f54e..03c2e2fa7 100644 --- a/augur/api/routes/__init__.py +++ b/augur/api/routes/__init__.py @@ -11,3 +11,4 @@ from .user import * from .dei import * from .util import * +from .complexity import * diff --git a/augur/api/routes/collection_status.py b/augur/api/routes/collection_status.py index 58e17311f..8afd8eb2d 100644 --- a/augur/api/routes/collection_status.py +++ b/augur/api/routes/collection_status.py @@ -25,7 +25,9 @@ def commit_collection_status(): # TODO: make this name automatic - wrapper? AND c.facade_status = 'Success'; """) - results = pd.read_sql(commit_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(commit_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -86,7 +88,9 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? ) D WHERE d.issues_enabled = 'true'; """) - results = pd.read_sql(issue_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(issue_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) @@ -156,7 +160,9 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ORDER BY ratio_abs; """) - results = pd.read_sql(pull_request_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(pull_request_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) diff --git a/augur/api/routes/complexity.py b/augur/api/routes/complexity.py index 81045720a..bee39eb92 100644 --- a/augur/api/routes/complexity.py +++ b/augur/api/routes/complexity.py @@ -6,32 +6,113 @@ import os import requests -AUGUR_API_VERSION = 'api/unstable' +from augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine -def create_routes(server): - @server.app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_languages(): - project_languages_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.programming_language, - e.code_lines, - e.files +@app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_languages(): + project_languages_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.programming_language, + e.code_lines, + e.files + FROM + augur_data.repo, + (SELECT + d.repo_id, + d.programming_language, + SUM(d.code_lines) AS code_lines, + COUNT(*)::int AS files + FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.programming_language, + augur_data.repo_labor.code_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id, d.programming_language) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(project_languages_sql, conn) + data = results.to_json(orient="records", 
date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + +@app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_files(): + project_files_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.files + FROM + augur_data.repo, + (SELECT + d.repo_id, + count(*) AS files FROM - augur_data.repo, - (SELECT + (SELECT + augur_data.repo_labor.repo_id + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(project_files_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + +@app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_lines(): + project_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.total_lines, + e.average_lines + FROM + augur_data.repo, + (SELECT d.repo_id, - d.programming_language, - SUM(d.code_lines) AS code_lines, - COUNT(*)::int AS files + SUM(d.total_lines) AS total_lines, + AVG(d.total_lines)::INT AS average_lines FROM (SELECT augur_data.repo_labor.repo_id, - augur_data.repo_labor.programming_language, - augur_data.repo_labor.code_lines + augur_data.repo_labor.total_lines FROM augur_data.repo_labor, ( SELECT @@ -43,113 +124,80 @@ def get_project_languages(): WHERE augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id, d.programming_language) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_languages_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) - @server.app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_files(): - project_files_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.files - FROM - augur_data.repo, - (SELECT - d.repo_id, - count(*) AS files - FROM - (SELECT - augur_data.repo_labor.repo_id - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_files_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - 
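
Every complexity endpoint filters `repo_labor` down to each repository's most recent collection run: a subquery takes `MAX(data_collection_date)` per `repo_id`, and only rows collected within five minutes of that maximum are kept, presumably so that one logical scan's files stay together even when their timestamps differ slightly. A stripped-down sketch of just that filter, reusing the same `augur_data.repo_labor` table and the engine the refactored route modules import:

```py
import pandas as pd
import sqlalchemy as s
from augur.api.server import engine  # the same engine the route modules import

# Keep only rows from each repo's latest repo_labor collection run, treating
# anything within 5 minutes of the newest timestamp as part of the same run.
latest_labor_sql = s.sql.text("""
    SELECT rl.repo_id, rl.programming_language, rl.code_lines
    FROM augur_data.repo_labor rl,
         (SELECT repo_id, MAX(data_collection_date) AS last_collected
          FROM augur_data.repo_labor
          GROUP BY repo_id) recent
    WHERE rl.repo_id = recent.repo_id
      AND rl.data_collection_date > recent.last_collected - (5 * interval '1 minute')
""")

with engine.connect() as conn:
    latest_labor = pd.read_sql(latest_labor_sql, conn)
```
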
mimetype="application/json") + with engine.connect() as conn: + results = pd.read_sql(project_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_lines(): - project_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.total_lines, - e.average_lines +@app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_comment_lines(): + comment_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.comment_lines, + e.avg_comment_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.comment_lines) AS comment_lines, + AVG(d.comment_lines)::INT AS avg_comment_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.total_lines) AS total_lines, - AVG(d.total_lines)::INT AS average_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.total_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.comment_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(comment_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_comment_lines(): - comment_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.comment_lines, - e.avg_comment_lines +@app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_blank_lines(): + blank_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.blank_lines, + e.avg_blank_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.blank_lines) AS blank_lines, + AVG(d.blank_lines)::int AS avg_blank_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.comment_lines) AS comment_lines, - AVG(d.comment_lines)::INT AS avg_comment_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - 
augur_data.repo_labor.comment_lines - FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.blank_lines + FROM augur_data.repo_labor, ( SELECT augur_data.repo_labor.repo_id, @@ -161,93 +209,57 @@ def get_project_comment_lines(): augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id """) - results = pd.read_sql(comment_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - @server.app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_blank_lines(): - blank_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.blank_lines, - e.avg_blank_lines - FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.blank_lines) AS blank_lines, - AVG(d.blank_lines)::int AS avg_blank_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(blank_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - + with engine.connect() as conn: + results = pd.read_sql(blank_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + - @server.app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_file_complexity(): - project_file_complexity_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.sum_code_complexity, - e.average_code_complexity +@app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_file_complexity(): + project_file_complexity_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.sum_code_complexity, + e.average_code_complexity + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.code_complexity) AS sum_code_complexity, + AVG(d.code_complexity)::int AS average_code_complexity FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.code_complexity) AS sum_code_complexity, - AVG(d.code_complexity)::int AS average_code_complexity - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.code_complexity - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 
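
The larger refactor in `complexity.py` is structural: the old `create_routes(server)` wrapper, which defined every view as a closure over `server`, is replaced by module-level functions decorated with `@app.route`, using `app` and `engine` imported from `..server`. Registration then happens as a side effect of importing the module, which is why `augur/api/routes/__init__.py` adds `from .complexity import *`. A reduced, hypothetical sketch of the shape of that pattern; the route path, function name, and query below are not part of the real module:

```py
# Hypothetical, reduced example of the decorator-based registration pattern.
import pandas as pd
import sqlalchemy as s
from flask import Response

from augur.api.routes import AUGUR_API_VERSION
from augur.api.server import app, engine  # previously reached through the `server` argument


@app.route('/{}/complexity/language_summary'.format(AUGUR_API_VERSION), methods=["GET"])
def get_project_language_summary():
    language_summary_sql = s.sql.text("""
        SELECT repo_id, programming_language, SUM(code_lines) AS code_lines
        FROM augur_data.repo_labor
        GROUP BY repo_id, programming_language
    """)

    with engine.connect() as conn:
        results = pd.read_sql(language_summary_sql, conn)

    data = results.to_json(orient="records", date_format="iso", date_unit="ms")
    return Response(response=data, status=200, mimetype="application/json")
```
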
* interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_file_complexity_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.code_complexity + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + with engine.connect() as conn: + results = pd.read_sql(project_file_complexity_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py index 896e00fc0..c600e8141 100644 --- a/augur/api/routes/contributor_reports.py +++ b/augur/api/routes/contributor_reports.py @@ -293,7 +293,9 @@ def new_contributor_data_collection(repo_id, required_contributions): WHERE RANK IN {rank_tuple} """) - df = pd.read_sql(contributor_query, engine) + + with engine.connect() as conn: + df = pd.read_sql(contributor_query, conn) df = df.loc[~df['full_name'].str.contains('bot', na=False)] df = df.loc[~df['login'].str.contains('bot', na=False)] @@ -334,7 +336,9 @@ def months_data_collection(start_date, end_date): FROM generate_series (TIMESTAMP '{start_date}', TIMESTAMP '{end_date}', INTERVAL '1 month' ) created_month ) d ) x ) y """) - months_df = pd.read_sql(months_query, engine) + + with engine.connect() as conn: + months_df = pd.read_sql(months_query, conn) # add yearmonths to months_df months_df[['year', 'month']] = months_df[['year', 'month']].astype(float).astype(int).astype(str) diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index dea79b79c..82324a8d6 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -52,7 +52,7 @@ def dei_track_repo(application: ClientApplication): return jsonify({"status": "Repo already exists"}) frontend_repo_group: RepoGroup = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() - repo_id = Repo.insert(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") + repo_id = Repo.insert_github_repo(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") if not repo_id: return jsonify({"status": "Error adding repo"}) diff --git a/augur/api/routes/metadata.py b/augur/api/routes/metadata.py index 389a3d9d1..f49dbb88f 100644 --- a/augur/api/routes/metadata.py +++ b/augur/api/routes/metadata.py @@ -47,7 +47,9 @@ def get_repo_info(): ORDER BY repo.repo_name; """) - results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, @@ -61,7 +63,9 @@ def contributions_count(): group by repo_git order by contributions desc; """) - results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results 
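
In `contributor_reports.py`, `months_data_collection` builds its month spine with Postgres `generate_series`, so the report has one row for every month between the start and end dates, including months with no contributions. A minimal sketch of that spine query on its own, with example date literals in place of the report's interpolated bounds:

```py
import pandas as pd
import sqlalchemy as s
from augur.api.server import engine  # assumed engine; any engine bound to the Augur DB works

# One row per month between the two dates, whether or not anything happened
# in that month; generate_series supplies the empty months.
months_sql = s.sql.text("""
    SELECT
        EXTRACT(YEAR  FROM created_month) AS year,
        EXTRACT(MONTH FROM created_month) AS month
    FROM generate_series(TIMESTAMP '2023-01-01',
                         TIMESTAMP '2023-12-01',
                         INTERVAL '1 month') created_month
""")

with engine.connect() as conn:
    months_df = pd.read_sql(months_sql, conn)
```
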
= pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, @@ -75,7 +79,9 @@ def contributors_count(): group by repo_git order by contributors desc; """) - results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py index 02f6e235c..9e6577954 100644 --- a/augur/api/routes/pull_request_reports.py +++ b/augur/api/routes/pull_request_reports.py @@ -53,7 +53,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, first_response_time, last_response_time, - average_time_between_responses, + EXTRACT ( EPOCH FROM average_time_between_responses), assigned_count, review_requested_count, labeled_count, @@ -62,15 +62,15 @@ def pull_request_data_collection(repo_id, start_date, end_date): referenced_count, closed_count, head_ref_force_pushed_count, - merged_count, + merged_count::INT, milestoned_count, unlabeled_count, head_ref_deleted_count, comment_count, - lines_added, - lines_removed, + COALESCE(lines_added, 0), + COALESCE(lines_removed, 0), commit_count, - file_count + COALESCE(file_count, 0) FROM repo, repo_groups, @@ -87,46 +87,47 @@ def pull_request_data_collection(repo_id, start_date, end_date): count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, - count(*) FILTER (WHERE action = 'merged') AS merged_count, - MIN(message.msg_timestamp) AS first_response_time, - COUNT(DISTINCT message.msg_timestamp) AS comment_count, - MAX(message.msg_timestamp) AS last_response_time, - (MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp) AS average_time_between_responses - FROM pull_request_events, pull_requests, repo, pull_request_message_ref, message - WHERE repo.repo_id = {repo_id} - AND repo.repo_id = pull_requests.repo_id - AND pull_requests.pull_request_id = pull_request_events.pull_request_id - AND pull_requests.pull_request_id = pull_request_message_ref.pull_request_id - AND pull_request_message_ref.msg_id = message.msg_id + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM pull_requests + LEFT OUTER JOIN pull_request_events on pull_requests.pull_request_id = pull_request_events.pull_request_id + JOIN repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN message on pull_request_message_ref.msg_id = 
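
The report query being rewritten here switches the events and message tables to `LEFT OUTER JOIN`s and wraps the affected aggregates in `COALESCE`, so pull requests with no comments or events still produce a row with zero counts and usable timestamps instead of dropping out or returning NULLs. A trimmed-down illustration of that combination, using the same tables as the hunk:

```py
import pandas as pd
import sqlalchemy as s
from augur.api.server import engine  # assumed engine; any engine bound to the Augur DB works

# Every pull request keeps a row; MAX() over zero joined messages is NULL,
# so COALESCE falls back to the close timestamp, mirroring the hunk above.
null_safe_counts_sql = s.sql.text("""
    SELECT
        pr.pull_request_id,
        COALESCE(COUNT(DISTINCT m.msg_timestamp), 0) AS comment_count,
        COALESCE(MAX(m.msg_timestamp), pr.pr_closed_at) AS last_response_time
    FROM pull_requests pr
    LEFT OUTER JOIN pull_request_message_ref ref
        ON pr.pull_request_id = ref.pull_request_id
    LEFT OUTER JOIN message m
        ON ref.msg_id = m.msg_id
    GROUP BY pr.pull_request_id, pr.pr_closed_at
""")

with engine.connect() as conn:
    null_safe_counts = pd.read_sql(null_safe_counts_sql, conn)
```
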
message.msg_id + WHERE repo.repo_id = 1 GROUP BY pull_requests.pull_request_id ) response_times ON pull_requests.pull_request_id = response_times.pull_request_id - LEFT OUTER JOIN ( - SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count FROM pull_request_commits, pull_requests, pull_request_meta + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM pull_request_commits, pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha AND pr_cmt_sha <> pull_request_meta.pr_sha GROUP BY pull_request_commits.pull_request_id ) all_commit_counts ON pull_requests.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label FROM pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_head_or_base = 'base' GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label ) base_labels ON base_labels.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count FROM pull_request_commits, commits, pull_requests, pull_request_meta WHERE cmt_commit_hash = pr_cmt_sha AND pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND commits.repo_id = pull_requests.repo_id AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha AND commits.cmt_commit_hash <> pull_request_meta.pr_sha @@ -136,11 +137,13 @@ def pull_request_data_collection(repo_id, start_date, end_date): WHERE repo.repo_group_id = repo_groups.repo_group_id AND repo.repo_id = pull_requests.repo_id - AND repo.repo_id = {repo_id} + AND repo.repo_id = 1 ORDER BY merged_count DESC """) - pr_all = pd.read_sql(pr_query, engine) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_query, conn) pr_all[['assigned_count', 'review_requested_count', diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index dfaeb81f7..62bc44068 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -227,7 +227,7 @@ def add_user_repo(): repo = request.args.get("repo_url") group_name = request.args.get("group_name") - result = current_user.add_repo(group_name, repo) + result = current_user.add_github_repo(group_name, repo) return jsonify(result[1]) @@ -260,7 +260,7 @@ def add_user_org(): org = request.args.get("org_url") group_name = request.args.get("group_name") - result = current_user.add_org(group_name, org) + result = current_user.add_github_org(group_name, org) return jsonify(result[1]) diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index cd6a8ad3b..71d3526b9 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -1,10 +1,11 @@ #SPDX-License-Identifier: MIT +from augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine import base64 import sqlalchemy as s 
import pandas as pd import json from flask import Response -import logging from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger @@ -12,10 +13,6 @@ logger = AugurLogger("augur").get_logger() -from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine - - @app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? repoGroupsSQL = s.sql.text(""" @@ -23,7 +20,9 @@ def get_all_repo_groups(): #TODO: make this name automatic - wrapper? FROM repo_groups ORDER BY rg_name """) - results = pd.read_sql(repoGroupsSQL, engine) + + with engine.connect() as conn: + results = pd.read_sql(repoGroupsSQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -52,13 +51,15 @@ def get_all_repos(): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id order by repo_name """) - results = pd.read_sql(get_all_repos_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(get_all_repos_sql, conn) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) b64_urls = [] @@ -91,21 +92,65 @@ def get_repos_in_repo_group(repo_group_id): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id WHERE repo_groups.repo_group_id = :repo_group_id ORDER BY repo.repo_git """) - results = pd.read_sql(repos_in_repo_groups_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(repos_in_repo_groups_SQL, conn, params={'repo_group_id': repo_group_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, mimetype="application/json") +@app.route('/{}/repos/'.format(AUGUR_API_VERSION)) +def get_repo_by_id(repo_id: int) -> Response: + repo_by_id_SQL = s.sql.text(""" + SELECT + repo.repo_id, + repo.repo_name, + repo.description, + repo.repo_git AS url, + a.commits_all_time, + b.issues_all_time, + c.pull_requests_all_time, + rg_name, + repo.repo_group_id + FROM + repo + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_commits) a + ON repo.repo_id = a.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_issues) b + ON repo.repo_id = b.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repo_prs) c + ON repo.repo_id = c.repo_id + JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id + WHERE + repo.repo_id = :id + """) + + results = pd.read_sql(repo_by_id_SQL, engine, params={"id": repo_id}) + results["url"] = results["url"].apply(lambda datum: datum.split("//")[1]) # cut "https://" off the URL + results["base64_url"] = [base64.b64encode(results.at[i, "url"].encode()) for i in results.index] + data = results.to_json(orient="records", date_format="iso", date_unit="ms") + + if not data or data == "[]": + return Response(response='{"status": "Repository ' + str(repo_id) + ' does not exist"}', + status=400, + mimetype="application/json") + + return 
Response(response=data[1:-1], # cut off brackets at each end, turns list of length 1 into single value + status=200, + mimetype="application/json") + @app.route('/{}/owner//repo/'.format(AUGUR_API_VERSION)) def get_repo_by_git_name(owner, repo): @@ -116,7 +161,8 @@ def get_repo_by_git_name(owner, repo): GROUP BY repo_id, rg_name """) - results = pd.read_sql(get_repo_by_git_name_sql, engine, params={'owner': '%{}_'.format(owner), 'repo': repo,}) + with engine.connect() as conn: + results = pd.read_sql(get_repo_by_git_name_sql, conn, params={'owner': '%{}%'.format(owner), 'repo': repo,}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -132,7 +178,9 @@ def get_repo_by_name(rg_name, repo_name): AND LOWER(rg_name) = LOWER(:rg_name) AND LOWER(repo_name) = LOWER(:repo_name) """) - results = pd.read_sql(get_repo_by_name_sql, engine, params={'rg_name': rg_name, 'repo_name': repo_name}) + + with engine.connect() as conn: + results = pd.read_sql(get_repo_by_name_sql, conn, params={'rg_name': rg_name, 'repo_name': repo_name}) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -146,7 +194,9 @@ def get_group_by_name(rg_name): FROM repo_groups WHERE lower(rg_name) = lower(:rg_name) """) - results = pd.read_sql(groupSQL, engine, params={'rg_name': rg_name}) + + with engine.connect() as conn: + results = pd.read_sql(groupSQL, conn, params={'rg_name': rg_name}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -160,7 +210,8 @@ def get_repos_for_dosocs(): WHERE a.setting='repo_directory' """) - results = pd.read_sql(get_repos_for_dosocs_SQL, engine) + with engine.connect() as conn: + results = pd.read_sql(get_repos_for_dosocs_SQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -188,7 +239,9 @@ def get_issues(repo_group_id, repo_id=None): GROUP BY issues.issue_id ORDER by OPEN_DAY DESC """) - results = pd.read_sql(get_issues_sql, engine, params={'repo_group_id': repo_group_id}) + + with engine.connect() as conn: + results = pd.read_sql(get_issues_sql, conn, params={'repo_group_id': repo_group_id}) else: get_issues_sql = s.sql.text(""" SELECT issue_title, @@ -208,7 +261,9 @@ def get_issues(repo_group_id, repo_id=None): GROUP BY issues.issue_id, repo_name ORDER by OPEN_DAY DESC """) - results = pd.read_sql(get_issues_sql, engine, params={'repo_id': repo_id}) + + with engine.connect() as conn: + results = pd.read_sql(get_issues_sql, conn, params={'repo_id': repo_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 287b07943..598c0cdb6 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -102,7 +102,18 @@ def av_add_user_repo(): if rg_obj: # add the orgs repos to the group add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + org_name, repo_name = Repo.parse_github_repo_url(url) + repo_git = f"https://gitlab.com/{org_name}/{repo_name}" + + # TODO: gitlab ensure the whole repo git is inserted so it can be found here + repo_obj = 
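
`util.py` also gains a repo-by-id endpoint: it reuses the same commits/issues/PRs subqueries as the list endpoints, returns HTTP 400 with a status message when the id is unknown, and strips the surrounding brackets from the one-element JSON array so the client receives a single object rather than a list. A hedged usage sketch, assuming the route binds its trailing path segment to `repo_id`, is mounted under the usual `api/unstable` prefix, and that a repo with id 1 exists; the host and port are hypothetical:

```py
import requests

# Hypothetical host/port; adjust to wherever the Augur API is served.
BASE = "http://localhost:5000/api/unstable"

resp = requests.get(f"{BASE}/repos/1")

if resp.status_code == 200:
    repo = resp.json()          # a single JSON object, not a one-element list
    print(repo["repo_name"], repo["url"])
else:
    print(resp.json()["status"])  # e.g. "Repository 1 does not exist"
```
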
Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + else: invalid_urls.append(url) diff --git a/augur/api/view/routes.py b/augur/api/view/routes.py index 8a9fc0597..72164a929 100644 --- a/augur/api/view/routes.py +++ b/augur/api/view/routes.py @@ -1,4 +1,8 @@ +""" +Defines the api routes for the augur views +""" import logging +import math from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash from sqlalchemy.orm.exc import NoResultFound from .utils import * @@ -37,9 +41,9 @@ def root(path=""): def logo(brand=None): if brand is None: return redirect(url_for('static', filename='img/augur_logo.png')) - elif "augur" in brand: + if "augur" in brand: return logo(None) - elif "chaoss" in brand: + if "chaoss" in brand: return redirect(url_for('static', filename='img/Chaoss_Logo_white.png')) return "" @@ -75,10 +79,16 @@ def repo_table_view(): if current_user.is_authenticated: data = current_user.get_repos(page = page, sort = sorting, direction = direction, search=query)[0] - page_count = (current_user.get_repo_count(search = query)[0] or 0) // pagination_offset + repos_count = (current_user.get_repo_count(search = query)[0] or 0) else: data = get_all_repos(page = page, sort = sorting, direction = direction, search=query)[0] - page_count = (get_all_repos_count(search = query)[0] or 0) // pagination_offset + repos_count = (get_all_repos_count(search = query)[0] or 0) + + page_count = math.ceil(repos_count / pagination_offset) - 1 + + if not data: + data = None + return render_module("repos-table", title="Repos", repos=data, query_key=query, activePage=page, pages=page_count, offset=pagination_offset, PS="repo_table_view", reverse = rev, sorting = sorting) diff --git a/augur/api/view/utils.py b/augur/api/view/utils.py index 228935574..298e9950a 100644 --- a/augur/api/view/utils.py +++ b/augur/api/view/utils.py @@ -1,10 +1,24 @@ +""" +Defines utility functions used by the augur api views +""" from pathlib import Path from concurrent.futures import ThreadPoolExecutor from flask import render_template, flash, url_for, Flask +from .init import init_logging from .init import * from ..server import app, db_session from augur.application.config import AugurConfig -import urllib.request, urllib.error, json, os, math, yaml, urllib3, time, logging, re +import urllib.request, urllib.error, json, os, math, yaml, urllib3, time, logging, re, math + +from augur.application.db.session import DatabaseSession +from augur.application.db.engine import DatabaseEngine +from augur.application.db.models import User, Repo, RepoGroup, UserGroup, UserRepo +from sqlalchemy import Column, Table, Integer, MetaData, or_ +from sqlalchemy.sql.operators import ilike_op, distinct_op +from sqlalchemy.sql.functions import coalesce +from augur.application.db.models.base import Base + +from sqlalchemy.orm import Query init_logging() @@ -66,6 +80,8 @@ def getSetting(key, section = "View"): loadSettings() +#version_check(settings) + """ ---------------------------------------------------------------- """ def loadReports(): @@ -298,3 +314,6 @@ def render_message(messageTitle, messageBody = None, title = None, redirect = No def render_module(module, **args): args.setdefault("body", module) return render_template('index.j2', **args) + +""" ---------------------------------------------------------------- +""" diff --git a/augur/application/cli/_multicommand.py 
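
The pagination fix in `repo_table_view` (together with the new `import math`) replaces floor division with `math.ceil(...) - 1`. The old floor expression reported one page too many whenever the repo count was an exact multiple of the page size, leaving an empty trailing page; the new expression always yields the index of the last zero-indexed page. A quick worked check, with a page size chosen only for illustration:

```py
import math

def last_page_index(repos_count: int, pagination_offset: int) -> int:
    """Zero-indexed index of the last page, as computed in repo_table_view."""
    return math.ceil(repos_count / pagination_offset) - 1

# 30 repos at 10 per page: exactly 3 pages, so the last valid page index is 2.
assert last_page_index(30, 10) == 2   # the old floor division gave 3 (an empty page)

# 31 repos at 10 per page: a partial 4th page, last index 3; both formulas agree here.
assert last_page_index(31, 10) == 3
```
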
b/augur/application/cli/_multicommand.py index 2df6e8b11..c0d8b1a96 100644 --- a/augur/application/cli/_multicommand.py +++ b/augur/application/cli/_multicommand.py @@ -27,7 +27,7 @@ def get_command(self, ctx, name): try: module = importlib.import_module('.' + name, 'augur.application.cli') return module.cli - except ModuleNotFoundError: + except ModuleNotFoundError as e: pass @click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 29afab2b0..9b6894a7d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -19,7 +19,8 @@ from datetime import datetime from augur import instance_id -from augur.tasks.start_tasks import augur_collection_monitor, CollectionState, create_collection_status_records +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.tasks.init.redis_connection import redis_connection @@ -91,9 +92,12 @@ def start(disable_collection, development, port): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - celery_beat_process = None - celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) + with DatabaseSession(logger) as db_session: + config = AugurConfig(logger, db_session) + log_level = config.get_value("Logging", "log_level") + celery_beat_process = None + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) if not disable_collection: diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index f09aaabbd..42d57ecc6 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -99,7 +99,7 @@ def add_repo_groups(filename): """ Create new repo groups in Augur's database """ - with DatabaseEngine() as engine, engine.connect() as connection: + with DatabaseEngine() as engine, engine.begin() as connection: df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), @@ -248,7 +248,7 @@ def update_api_key(api_key): """ ) - with DatabaseEngine() as engine, engine.connect() as connection: + with DatabaseEngine() as engine, engine.begin() as connection: connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index abdc6de54..756218139 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -37,8 +37,63 @@ def extract_needed_pr_label_data(labels: List[dict], repo_id: int, tool_source: return label_dicts -# retrieve only the needed data for pr assignees from the api response + +def extract_needed_mr_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + 
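
The `db.py` hunks swap `engine.connect()` for `engine.begin()` on the commands that write. The difference is transactional: `begin()` yields a connection inside a transaction block that commits automatically when the `with` block exits cleanly and rolls back on an exception, whereas a bare `connect()` in SQLAlchemy 2.x-style usage leaves the INSERT or UPDATE uncommitted unless the caller commits explicitly. A reduced sketch of the shape of that change; the INSERT is illustrative and omits columns the real table may require:

```py
import sqlalchemy as s
from augur.application.db.engine import DatabaseEngine  # as used by the CLI commands

insert_group_sql = s.sql.text("""
    INSERT INTO augur_data.repo_groups (repo_group_id, rg_name)
    VALUES (:rg_id, :rg_name)
""")

# engine.begin() wraps the connection in a transaction that commits when the
# block exits without error and rolls back if an exception is raised.
with DatabaseEngine() as engine, engine.begin() as connection:
    connection.execute(insert_group_sql, {"rg_id": 100, "rg_name": "example-group"})
```
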
data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + 'pr_src_id': label['id'], + 'pr_src_node_id': None, + 'pr_src_url': None, + 'pr_src_description': label['name'], + 'pr_src_color': label['color'], + # TODO: Populate this by making an api call for each label + 'pr_src_default_bool': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'repo_id': repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr assignees from the api response + + Arguments: + assignees: List of dictionaries of asignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed asignee dicts + """ if len(assignees) == 0: return [] @@ -48,7 +103,6 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so for assignee in assignees: assignee_dict = { - # store the pr_url data on in the pr assignee data for now so we can relate it back to a pr later 'contrib_id': assignee["cntrb_id"], 'pr_assignee_src_id': int(assignee['id']), 'tool_source': tool_source, @@ -61,8 +115,59 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so return assignee_dicts -# retrieve only the needed data for pr reviewers from the api response +def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for merge request assignees from the api response + + Arguments: + assignees: List of dictionaries of asignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed asignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + 'contrib_id': None, + 'repo_id': repo_id, + # TODO: Temporarily setting this to id which the id of the contributor, unitl we can get the contrib_id set and create a unique on the contrib_id and the pull_request_id + 'pr_assignee_src_id': assignee["id"], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + + + def extract_needed_pr_reviewer_data(reviewers: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + reviewers: List of dictionaries of reviewer data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed reviewer dicts + """ if len(reviewers) == 0: return [] @@ -247,6 +352,42 @@ def extract_needed_issue_assignee_data(assignees: List[dict], repo_id: int, tool return assignee_dicts +def 
extract_needed_gitlab_issue_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue assignees from the api response + + Arguments: + assignees: List of dictionaries of gitlab assignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed assignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + "cntrb_id": None, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "issue_assignee_src_id": assignee['id'], + "issue_assignee_src_node": None, + "repo_id": repo_id + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + # retrieve only the needed data for pr labels from the api response @@ -277,9 +418,62 @@ def extract_needed_issue_label_data(labels: List[dict], repo_id: int, tool_sourc return label_dicts +def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue labels from the api response + + Arguments: + labels: List of dictionaries of gitlab issue label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + "label_text": label["name"], + "label_description": label.get("description", None), + "label_color": label['color'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "label_src_id": label['id'], + "label_src_node_id": None, + "repo_id": repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + -# retrieve only the needed data for pr labels from the api response def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr labels from the api response + + Arguments: + message: Message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict of message ref data. 
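
These extractor helpers all follow the same convention: take the raw platform payload plus provenance arguments (`tool_source`, `tool_version`, `data_source`) and return flat dicts shaped for bulk insert, with fields that cannot be derived yet (node ids, `cntrb_id`) left as `None`. A usage sketch for the GitLab label extractor with a fabricated payload; the import assumes the patched `data_parse` module:

```py
from augur.application.db.data_parse import extract_needed_gitlab_issue_label_data

# Fabricated GitLab label payload; only the fields the extractor reads are included.
labels = [
    {"id": 42, "name": "bug", "color": "#d9534f", "description": "Something is broken"},
]

label_dicts = extract_needed_gitlab_issue_label_data(
    labels,
    repo_id=1,
    tool_source="Gitlab issue task",   # caller-chosen provenance strings
    tool_version="1.0",
    data_source="Gitlab API",
)

# -> [{"label_text": "bug", "label_description": "Something is broken",
#      "label_color": "#d9534f", "label_src_id": 42, "repo_id": 1, ...}]
```
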
+ """ message_ref_dict = { 'issue_id': issue_id, @@ -311,9 +505,21 @@ def extract_needed_pr_message_ref_data(comment: dict, pull_request_id: int, repo def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr api response + + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + Returns: + Parsed pr dict + """ - pr_dict = { + pr = { 'repo_id': repo_id, 'pr_url': pr['url'], # 1-22-2022 inconsistent casting; sometimes int, sometimes float in bulk_insert @@ -367,9 +573,23 @@ def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): 'data_source': 'GitHub API' } - return pr_dict + return pr def extract_needed_issue_data(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed issue dict + """ dict_data = { 'cntrb_id': None, # this the contributor who closed the issue @@ -513,8 +733,438 @@ def extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, return review_row +def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr gitlab api response - + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + + Returns: + Parsed pr dict + """ + + pr_dict = { + 'repo_id': repo_id, + 'pr_url': pr['web_url'], + 'pr_src_id': pr['id'], + 'pr_src_node_id': None, + 'pr_html_url': pr['web_url'], + 'pr_diff_url': None, + 'pr_patch_url': None, + 'pr_issue_url': None, + 'pr_augur_issue_id': None, + 'pr_src_number': pr['iid'], + 'pr_src_state': pr['state'], + 'pr_src_locked': pr['discussion_locked'], + 'pr_src_title': pr['title'], + # TODO: Add contributor logic for gitlab + 'pr_augur_contributor_id': None, + 'pr_body': pr['description'], + 'pr_created_at': pr['created_at'], + 'pr_updated_at': pr['updated_at'], + 'pr_closed_at': pr['closed_at'], + 'pr_merged_at': pr['merged_at'], + 'pr_merge_commit_sha': pr['merge_commit_sha'], + 'pr_teams': None, + 'pr_milestone': pr['milestone'].get('title') if pr['milestone'] else None, + 'pr_commits_url': None, + 'pr_review_comments_url': None, + 'pr_review_comment_url': None, + 'pr_comments_url': None, + 'pr_statuses_url': None, + 'pr_meta_head_id': None, + 'pr_meta_base_id': None, + 'pr_src_issue_url': None, + 'pr_src_comments_url': None, + 'pr_src_review_comments_url': None, + 'pr_src_commits_url': None, + 'pr_src_statuses_url': None, + 'pr_src_author_association': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': 'Gitlab API' + } + + return pr_dict + + +def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue gitlab api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that 
processed the data + data_source: platform source + + Returns: + Parsed issue dict + """ + + issue_dict = { + "repo_id": repo_id, + "reporter_id": None, + "pull_request": None, + "pull_request_id": None, + "created_at": issue['created_at'], + "issue_title": issue['title'], + "issue_body": issue['description'] if 'description' in issue else None, + "comment_count": issue['user_notes_count'], + "updated_at": issue['updated_at'], + "closed_at": issue['closed_at'], + "repository_url": issue['_links']['project'], + "issue_url": issue['_links']['self'], + "labels_url": None, + "comments_url": issue['_links']['notes'], + "events_url": None, + "html_url": issue['_links']['self'], + "issue_state": issue['state'], + "issue_node_id": None, + "gh_issue_id": issue['id'], + "gh_issue_number": issue['iid'], + "gh_user_id": issue['author']['id'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_dict + + + +def extract_gitlab_mr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the mr event gitlab api response + + Arguments: + event: Event data dict + pr_id: id of the pr + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + mr_event = { + 'pull_request_id': pr_id, + 'cntrb_id': None, + 'action': event['action_name'], + 'action_commit_hash': None, + 'created_at': event['created_at'], + 'issue_event_src_id': event['target_id'], + 'repo_id': repo_id, + 'platform_id': platform_id, + 'node_id': None, + 'node_url': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return mr_event + +def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the issue event gitlab api response + + Arguments: + event: Event data dict + issue_id: id of the issue + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + issue_event = { + "issue_event_src_id": event['target_id'], + "issue_id": issue_id, + "node_id": None, + "node_url": None, + "cntrb_id": None, + "created_at": event['created_at'], + "action": event["action_name"], + "action_commit_hash": None, + "platform_id": platform_id, + "repo_id" : repo_id, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_event + + +def extract_needed_mr_reviewer_data(data: List[dict], pull_request_id, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + data: List of dictionaries that contain mr reviewer data to parse + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of extracted relevant data from needed mr reviwer data + """ + + if len(data) == 0: + 
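
As with the label helper, the GitLab issue extractor maps GitLab's field names (`iid`, `user_notes_count`, `_links`) onto the column names Augur already uses for GitHub issues (`gh_issue_number`, `comment_count`, and so on), leaving GitHub-only fields as `None`. A usage sketch with a fabricated, minimal issue payload; the import again assumes the patched module:

```py
from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue

# Fabricated GitLab issue payload, trimmed to the fields the extractor reads.
gitlab_issue = {
    "id": 9001,
    "iid": 12,
    "title": "Collection stalls on large repos",
    "description": "Steps to reproduce ...",
    "state": "opened",
    "user_notes_count": 3,
    "created_at": "2023-09-01T12:00:00Z",
    "updated_at": "2023-09-02T08:30:00Z",
    "closed_at": None,
    "author": {"id": 777},
    "_links": {
        "project": "https://gitlab.com/api/v4/projects/123",
        "self": "https://gitlab.com/api/v4/projects/123/issues/12",
        "notes": "https://gitlab.com/api/v4/projects/123/issues/12/notes",
    },
}

issue_row = extract_needed_issue_data_from_gitlab_issue(
    gitlab_issue, repo_id=1,
    tool_source="Gitlab issue task", tool_version="1.0", data_source="Gitlab API",
)

assert issue_row["gh_issue_number"] == 12   # GitLab's iid
assert issue_row["comment_count"] == 3      # user_notes_count
```
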
return [] + + reviewer_dicts = [] + for x in data: + + for _ in x["suggested_approvers"]: + + reviewer_dict = { + 'pull_request_id': pull_request_id, + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + reviewer_dicts.append(reviewer_dict) + + return reviewer_dicts + + +def extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr commit data from the api response + + Arguments: + commit: commit data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dictionary of the extracted commit data + """ + + commit = { + 'pull_request_id': pull_request_id, + 'pr_cmt_sha': commit['id'], + 'pr_cmt_node_id': None, + 'pr_cmt_message': commit['message'], + 'repo_id': repo_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + return commit + + +def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr file data from the api response + Arguments: + gitlab_file_data: file data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed gitlab file changes + """ + files = [] + + changes = gitlab_file_data["changes"] + for file_changes in changes: + try: + deletes = int(file_changes['diff'].split('@@')[1].strip().split(' ')[0].split(',')[1]) + adds = int(file_changes['diff'].split('@@')[1].strip().split(' ')[1].split(',')[1]) + except Exception: + deletes = 0 + adds = 0 + + file_dict = { + 'pull_request_id': pull_request_id, + 'repo_id': repo_id, + 'pr_file_additions': adds, + 'pr_file_deletions': deletes, + 'pr_file_path': file_changes['old_path'], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + files.append(file_dict) + + return files + + +def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr metadata from the api response + + Arguments: + mr_dict: mr data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed mr metadata + """ + head = {'sha': mr_dict['diff_refs']['head_sha'], + 'ref': mr_dict['target_branch'], + 'label': str(mr_dict['target_project_id']) + ':' + mr_dict['target_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['target_project_id']) + } + + base = {'sha': mr_dict['diff_refs']['base_sha'], + 'ref': mr_dict['source_branch'], + 'label': str(mr_dict['source_project_id']) + ':' + mr_dict['source_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['source_project_id']) + } + + pr_meta_dict = { + 'head': head, + 'base': base + } + all_meta = [] + for pr_side, pr_meta_data in pr_meta_dict.items(): + pr_meta = { + 'pull_request_id': 
pull_request_id, + 'repo_id': repo_id, + 'pr_head_or_base': pr_side, + 'pr_src_meta_label': pr_meta_data['label'], + 'pr_src_meta_ref': pr_meta_data['ref'], + 'pr_sha': pr_meta_data['sha'], + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + all_meta.append(pr_meta) + + return all_meta + + +def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Extract the message id for a given message on an issue from an api response + and connect it to the relevant repo id. + + Arguments: + message: message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the message ref id as well as the repo id. + """ + + message_ref_dict = { + 'issue_id': issue_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'issue_msg_ref_src_comment_id': int(message['id']), + 'issue_msg_ref_src_node_id': None, + 'repo_id': repo_id + } + + return message_ref_dict + + +def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Extract specific metadata for a comment from an api response + and connect it to the relevant platform id. + + Arguments: + comment: comment data dict + platform_id: augur id of the relevant platform + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing parsed comment text and metadata + """ + + comment_dict = { + "pltfrm_id": platform_id, + "msg_text": comment['body'], + "msg_timestamp": comment['created_at'], + "cntrb_id": None, + "platform_msg_id": int(comment['id']), + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return comment_dict + +def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for an mr comment message ref from the api response + + Arguments: + comment: comment data dict + pull_request_id: id of the PR + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the comment, pr and repo id of the parsed comment data.
+ """ + + pr_msg_ref = { + 'pull_request_id': pull_request_id, + 'pr_message_ref_src_comment_id': comment['id'], + 'repo_id': repo_id, + 'pr_message_ref_src_node_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return pr_msg_ref diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 95cb0725d..7f97e4bbd 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -267,6 +267,7 @@ class Contributor(Base): @classmethod def from_github(cls, contributor, tool_source, tool_version, data_source): + from augur.tasks.util.AugurUUID import GithubUUID cntrb_id = GithubUUID() cntrb_id["user"] = contributor["id"] @@ -563,6 +564,8 @@ class RepoGroup(Base): data_source = Column(String) data_collection_date = Column(TIMESTAMP(precision=0)) + repo = relationship("Repo", back_populates="repo_group") + @staticmethod def is_valid_repo_group_id(session, repo_group_id: int) -> bool: """Deterime is repo_group_id exists. @@ -865,8 +868,8 @@ class Repo(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - repo_group = relationship("RepoGroup") - user_repo = relationship("UserRepo") + repo_group = relationship("RepoGroup", back_populates="repo") + user_repo = relationship("UserRepo", back_populates="repo") collection_status = relationship("CollectionStatus", back_populates="repo") issues = relationship("Issue", back_populates="repo") prs = relationship("PullRequest", back_populates="repo") @@ -926,6 +929,44 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return False, {"status": f"Github Error: {data['message']}"} return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + + @staticmethod + def is_valid_gitlab_repo(gl_session, url: str) -> bool: + """Determine whether a GitLab repo URL is valid. + + Args: + gl_session: GitLab session object with API key + url: Repository URL + + Returns: + True if repo URL is valid, False otherwise + """ + from augur.tasks.github.util.github_paginator import hit_api + + REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return False, {"status": "Invalid repo URL"} + + # Encode namespace and project name for the API request + project_identifier = f"{owner}%2F{repo}" + url = REPO_ENDPOINT.format(project_identifier) + + attempts = 0 + while attempts < 10: + response = hit_api(gl_session.oauths, url, logger) + + if response.status_code == 404: + return False, {"status": "Invalid repo"} + + if response.status_code == 200: + return True, {"status": "Valid repo"} + + attempts += 1 + + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def parse_github_repo_url(url: str) -> tuple: @@ -945,6 +986,29 @@ def parse_github_repo_url(url: str) -> tuple: capturing_groups = result.groups() + owner = capturing_groups[0] + repo = capturing_groups[1] + + return owner, repo + + @staticmethod + def parse_gitlab_repo_url(url: str) -> tuple: + """ Gets the owner and repo from a gitlab url. + + Args: + url: Gitlab url + + Returns: + Tuple of owner and repo. Or a tuple of None and None if the url is invalid. 
+ """ + + result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url) + + if not result: + return None, None + + capturing_groups = result.groups() + + owner = capturing_groups[0] + repo = capturing_groups[1] + + return owner, repo + @@ -971,12 +1035,60 @@ def parse_github_org_url(url): return result.groups()[0] @staticmethod - def insert(session, url: str, repo_group_id: int, tool_source, repo_type): + def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): + """Add a repo to the repo table. + + Args: + url: repo url + repo_group_id: group to assign repo to + + Note: + If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist it will simply insert the repo. + """ + + if not isinstance(url, str) or not isinstance(repo_group_id, int) or not isinstance(tool_source, str): + return None + + if not RepoGroup.is_valid_repo_group_id(session, repo_group_id): + return None + + if url.endswith("/"): + url = url[:-1] + + url = url.lower() + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return None + + repo_data = { + "repo_group_id": repo_group_id, + "repo_git": url, + "repo_path": f"gitlab.com/{owner}/", + "repo_name": repo, + "repo_type": None, + "tool_source": tool_source, + "tool_version": "1.0", + "data_source": "Git" + } + + repo_unique = ["repo_git"] + return_columns = ["repo_id"] + result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + + if not result: + return None + + return result[0]["repo_id"] + + @staticmethod + def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type): """Add a repo to the repo table. Args: url: repo url repo_group_id: group to assign repo to + repo_type: github or gitlab Note: If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist is will simply insert the repo.
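The GitLab additions above mirror the existing GitHub path: parse the URL into an owner and project name, validate it against the API, then insert the repo row. As a quick reference, here is a minimal sketch of the parsing step using the same regex as the new `parse_gitlab_repo_url` method; the sample URLs are illustrative assumptions, not taken from Augur's test suite.

```python
import re

# Regex copied from the new Repo.parse_gitlab_repo_url method introduced in this patch.
GITLAB_REPO_RE = r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$"

def parse_gitlab_repo_url(url: str):
    """Return (owner, repo) for a gitlab.com project URL, or (None, None) if it does not match."""
    result = re.search(GITLAB_REPO_RE, url)
    if not result:
        return None, None
    owner, repo = result.groups()[0], result.groups()[1]
    return owner, repo

# Illustrative usage (hypothetical URLs):
print(parse_gitlab_repo_url("https://gitlab.com/gitlab-org/gitaly/"))  # ('gitlab-org', 'gitaly')
print(parse_gitlab_repo_url("https://github.com/chaoss/augur"))        # (None, None)
```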
@@ -1207,10 +1319,6 @@ class Commit(Base): primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", back_populates="commits" ) - contributor1 = relationship( - "Contributor", - primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", - ) repo = relationship("Repo", back_populates="commits") message_ref = relationship("CommitCommentRef", back_populates="cmt") diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index f702d829a..47f28b12f 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -271,9 +271,9 @@ class User(Base): {"schema": "augur_operations"} ) - groups = relationship("UserGroup") - tokens = relationship("UserSessionToken") - applications = relationship("ClientApplication") + groups = relationship("UserGroup", back_populates="user") + tokens = relationship("UserSessionToken", back_populates="user") + applications = relationship("ClientApplication", back_populates="user") _is_authenticated = False _is_active = True @@ -449,17 +449,30 @@ def remove_group(self, group_name): return result - def add_repo(self, group_name, repo_url): + def add_github_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, self.user_id, group_name) + result = UserRepo.add_github_repo(session, repo_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} return result + + def add_gitlab_repo(self, group_name, repo_url): + + from augur.tasks.gitlab.gitlab_task_session import GitlabTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + try: + with GitlabTaskSession(logger) as session: + result = UserRepo.add_gitlab_repo(session, repo_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} + + return result + def remove_repo(self, group_name, repo_id): @@ -468,14 +481,14 @@ def remove_repo(self, group_name, repo_id): return result - def add_org(self, group_name, org_url): + def add_github_org(self, group_name, org_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} @@ -628,8 +641,8 @@ class UserGroup(Base): {"schema": "augur_operations"} ) - user = relationship("User") - repos = relationship("UserRepo") + user = relationship("User", back_populates="groups") + repos = relationship("UserRepo", back_populates="group") @staticmethod def insert(session, user_id:int, group_name:str) -> dict: @@ -739,8 +752,8 @@ class UserRepo(Base): ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) - repo = relationship("Repo") - group = relationship("UserGroup") + repo = relationship("Repo", back_populates="user_repo") + group = relationship("UserGroup", back_populates="repos") @staticmethod def insert(session, repo_id: int, group_id:int = 1) -> bool: @@ -769,9 +782,69 @@ def insert(session, repo_id: 
int, group_id:int = 1) -> bool: return False return data[0]["group_id"] == group_id and data[0]["repo_id"] == repo_id + + @staticmethod + def add_gitlab_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_group_id=None) -> dict: + """Add repo to the user repo table + + Args: + urls: list of repo urls + user_id: id of user_id from users table + group_name: name of group to add repo to. + group_id: id of the group + valid_repo: boolean that indicates whether the repo has already been validated + + Note: + Either the group_name or group_id can be passed not both + + Returns: + Dict that contains the key "status" and additional useful data + """ + + if group_name and group_id: + return False, {"status": "Pass only the group name or group id not both"} + + if not group_name and not group_id: + return False, {"status": "Need group name or group id to add a repo"} + + if group_id is None: + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False, {"status": "Invalid group name"} + + if not from_org_list: + result = Repo.is_valid_gitlab_repo(session, url) + if not result[0]: + return False, {"status": result[1]["status"], "repo_url": url} + + # if no repo_group_id is passed then assign the repo to the frontend repo group + if repo_group_id is None: + + frontend_repo_group = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() + if not frontend_repo_group: + return False, {"status": "Could not find repo group with name 'Frontend Repos'", "repo_url": url} + + repo_group_id = frontend_repo_group.repo_group_id + + + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend") + if not repo_id: + return False, {"status": "Repo insertion failed", "repo_url": url} + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False, {"status": "repo_user insertion failed", "repo_url": url} + + #collection_status records are now only added during collection -IM 5/1/23 + #status = CollectionStatus.insert(session, repo_id) + #if not status: + # return False, {"status": "Failed to create status for repo", "repo_url": url} + + return True, {"status": "Repo Added", "repo_url": url} @staticmethod - def add(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: + def add_github_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: """Add repo to the user repo table Args: @@ -820,7 +893,7 @@ def add(session, url: List[str], user_id: int, group_name=None, group_id=None, f repo_group_id = frontend_repo_group.repo_group_id - repo_id = Repo.insert(session, url, repo_group_id, "Frontend", repo_type) + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} @@ -862,7 +935,7 @@ def delete(session, repo_id:int, user_id:int, group_name:str) -> dict: return True, {"status": "Repo Removed"} @staticmethod - def add_org_repos(session, url: List[str], user_id: int, group_name: int): + def add_github_org_repos(session, url: List[str], user_id: int, group_name: int): """Add list of orgs and their repos to a users repos. 
Args: @@ -911,7 +984,7 @@ def add_org_repos(session, url: List[str], user_id: int, group_name: int): failed_repos = [] for repo in repos: - result = UserRepo.add(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) + result = UserRepo.add_github_repo(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) # keep track of all the repos that failed if not result[0]: @@ -949,9 +1022,9 @@ class UserSessionToken(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey"), nullable=False) created_at = Column(BigInteger) - user = relationship("User") - application = relationship("ClientApplication") - refresh_tokens = relationship("RefreshToken") + user = relationship("User", back_populates="tokens") + application = relationship("ClientApplication", back_populates="sessions") + refresh_tokens = relationship("RefreshToken", back_populates="user_session") @staticmethod def create(session, user_id, application_id, seconds_to_expire=86400): @@ -991,9 +1064,9 @@ class ClientApplication(Base): redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) - user = relationship("User") + user = relationship("User", back_populates="applications") sessions = relationship("UserSessionToken") - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="application") def __eq__(self, other): return isinstance(other, ClientApplication) and str(self.id) == str(other.id) @@ -1013,8 +1086,8 @@ class Subscription(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) type_id = Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) - application = relationship("ClientApplication") - type = relationship("SubscriptionType") + application = relationship("ClientApplication", back_populates="subscriptions") + type = relationship("SubscriptionType", back_populates="subscriptions") class SubscriptionType(Base): __tablename__ = "subscription_types" @@ -1027,7 +1100,7 @@ class SubscriptionType(Base): id = Column(BigInteger, primary_key=True) name = Column(String, nullable=False) - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="type") class RefreshToken(Base): @@ -1040,7 +1113,7 @@ class RefreshToken(Base): id = Column(String, primary_key=True) user_session_token = Column(ForeignKey("augur_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) - user_session = relationship("UserSessionToken") + user_session = relationship("UserSessionToken", back_populates="refresh_tokens") @staticmethod def create(session, user_session_token_id): @@ -1159,16 +1232,28 @@ def insert(session, repo_id): repo_git = repo.repo_git collection_status_unique = ["repo_id"] + pr_issue_count = 0 + github_weight = 0 + if "github" in repo_git: - try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) - #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") - github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) - except Exception as e: - pr_issue_count = None - github_weight = None - session.logger.error( - ''.join(traceback.format_exception(None, e, 
e.__traceback__))) + try: + pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + else: + try: + pr_issue_count = 0 + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) record = { @@ -1178,6 +1263,7 @@ def insert(session, repo_id): "secondary_weight": github_weight, "ml_weight": github_weight } + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) diff --git a/augur/application/db/session.py b/augur/application/db/session.py index 2212c1fdc..22379ad05 100644 --- a/augur/application/db/session.py +++ b/augur/application/db/session.py @@ -85,7 +85,7 @@ def __del__(self): def execute_sql(self, sql_text): - with self.engine.connect() as connection: + with self.engine.begin() as connection: return_data = connection.execute(sql_text) @@ -93,10 +93,10 @@ def execute_sql(self, sql_text): def fetchall_data_from_sql_text(self,sql_text): - with self.engine.connect() as connection: + with self.engine.begin() as connection: - result = connection.execute(sql_text).fetchall() - return [dict(zip(row.keys(), row)) for row in result] + result = connection.execute(sql_text) + return [dict(row) for row in result.mappings()] def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: @@ -174,7 +174,9 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s while attempts < 10: try: - with EngineConnection(self.engine) as connection: + #begin keyword is needed for sqlalchemy 2.x + #this is because autocommit support was removed in 2.0 + with self.engine.begin() as connection: connection.execute(stmnt) break except OperationalError as e: @@ -191,14 +193,16 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s raise e except Exception as e: - if(len(data) == 1): + #self.logger.info(e) + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert data in 10 attempts") @@ -213,8 +217,8 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s # othewise it gets the requested return columns and returns them as a list of dicts while attempts < 10: try: - with EngineConnection(self.engine) as connection: - return_data_tuples = connection.execute(stmnt).fetchall() + with 
self.engine.begin() as connection: + return_data_tuples = connection.execute(stmnt) break except OperationalError as e: if isinstance(e.orig, DeadlockDetected): @@ -228,14 +232,15 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s raise e except Exception as e: - if(len(data) == 1): + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert and return data in 10 attempts") @@ -244,9 +249,11 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s if deadlock_detected is True: self.logger.error("Made it through even though Deadlock was detected") - return_data = [] - for data_tuple in return_data_tuples: - return_data.append(dict(data_tuple)) + return_data = [dict(row) for row in return_data_tuples.mappings()] + + #no longer working in sqlalchemy 2.x + #for data_tuple in return_data_tuples: + # return_data.append(dict(data_tuple)) # using on confilict do nothing does not return the # present values so this does gets the return values diff --git a/augur/application/schema/alembic/env.py b/augur/application/schema/alembic/env.py index d170ef243..94127a43b 100644 --- a/augur/application/schema/alembic/env.py +++ b/augur/application/schema/alembic/env.py @@ -5,7 +5,9 @@ from alembic import context from augur.application.db.models.base import Base -from augur.application.db.engine import DatabaseEngine +from augur.application.db.engine import DatabaseEngine, get_database_string +from sqlalchemy import create_engine, event +from sqlalchemy.pool import NullPool # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -59,8 +61,20 @@ def run_migrations_online(): and associate a connection with the context. 
""" + url = get_database_string() + engine = create_engine(url) - with DatabaseEngine() as connectable, connectable.connect() as connection: + @event.listens_for(engine, "connect", insert=True) + def set_search_path(dbapi_connection, connection_record): + existing_autocommit = dbapi_connection.autocommit + dbapi_connection.autocommit = True + cursor = dbapi_connection.cursor() + cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.close() + dbapi_connection.autocommit = existing_autocommit + + + with engine.connect() as connection: context.configure( connection=connection, target_metadata=target_metadata, diff --git a/augur/application/schema/alembic/versions/1_augur_new_changes.py b/augur/application/schema/alembic/versions/1_augur_new_changes.py index 0be3780a3..2e8440294 100644 --- a/augur/application/schema/alembic/versions/1_augur_new_changes.py +++ b/augur/application/schema/alembic/versions/1_augur_new_changes.py @@ -300,8 +300,9 @@ def change_cntrb_id_to_uuid_5(upgrade=True): """ INSERT INTO "augur_data"."contributors"("cntrb_id", "cntrb_login", "cntrb_email", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:cntrb_uuid, 'not-provided', NULL, NULL, '2019-06-13 11:33:39', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1, 'nobody', 'http://fake.me', 'http://fake.me', 'x', 'http://fake.me', NULL, 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', NULL, NULL, NULL, NULL, NULL, NULL, '2019-06-13 16:35:25'); """ - ), - cntrb_uuid=UnresolvableUUID().to_UUID() + ).bindparams( + cntrb_uuid=UnresolvableUUID().to_UUID() + ) ) conn.execute( @@ -309,8 +310,9 @@ def change_cntrb_id_to_uuid_5(upgrade=True): """ INSERT INTO "augur_data"."contributors" ("cntrb_id", "cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:cntrb_uuid, 'nan', 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, 'nan', 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 
'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46'); """ - ), - cntrb_uuid=GithubUUID().to_UUID() + ).bindparams( + cntrb_uuid=GithubUUID().to_UUID() + ) ) else: diff --git a/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py new file mode 100644 index 000000000..f381ec48e --- /dev/null +++ b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py @@ -0,0 +1,245 @@ +""" Updating materialized views and associated indices + +Revision ID: 26 +Revises: 25 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = '26' +down_revision = '25' +branch_labels = None +depends_on = None + + +def upgrade(): + + mview_keys_26() + +def downgrade(): + + upgrade=False + + mview_keys_26(upgrade) + +def mview_keys_26(upgrade=True): + + if upgrade: + conn = op.get_bind() + conn.execute(text(""" + drop materialized view if exists augur_data.explorer_pr_assignments; + drop materialized view if exists augur_data.explorer_user_repos; + drop materialized view if exists augur_data.explorer_pr_response_times; + drop materialized view if exists augur_data.explorer_pr_response; + drop materialized view if exists augur_data.explorer_issue_assignments;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_assignments as + SELECT + pr.pull_request_id, + pr.repo_id AS ID, + pr.pr_created_at AS created, + pr.pr_closed_at AS closed, + pre.created_at AS assign_date, + pre.ACTION AS assignment_action, + pre.cntrb_id AS assignee, + pre.node_id AS node_id + FROM + ( + augur_data.pull_requests pr + LEFT JOIN augur_data.pull_request_events pre ON ( + ( + ( pr.pull_request_id = pre.pull_request_id ) + AND ( + ( pre.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response as + SELECT pr.pull_request_id, + pr.repo_id AS id, + pr.pr_augur_contributor_id AS cntrb_id, + m.msg_timestamp, + m.msg_cntrb_id, + pr.pr_created_at, + pr.pr_closed_at + FROM (augur_data.pull_requests pr + LEFT JOIN ( SELECT prr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_review_message_ref prrmr, + augur_data.pull_requests pr_1, + augur_data.message m_1, + augur_data.pull_request_reviews prr + WHERE ((prrmr.pr_review_id = prr.pr_review_id) AND (prrmr.msg_id = m_1.msg_id) AND (prr.pull_request_id = pr_1.pull_request_id)) + UNION + SELECT prmr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_message_ref prmr, + augur_data.pull_requests pr_1, + augur_data.message m_1 + WHERE ((prmr.pull_request_id = pr_1.pull_request_id) AND (prmr.msg_id = m_1.msg_id))) m ON ((m.pull_request_id = pr.pull_request_id)));""")) + + + + conn.execute(text(""" + create 
materialized view augur_data.explorer_user_repos as + SELECT a.login_name, + a.user_id, + b.group_id, + c.repo_id + FROM augur_operations.users a, + augur_operations.user_groups b, + augur_operations.user_repos c + WHERE ((a.user_id = b.user_id) AND (b.group_id = c.group_id)) + ORDER BY a.user_id;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response_times as + SELECT repo.repo_id, + pull_requests.pr_src_id, + repo.repo_name, + pull_requests.pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at, + pull_requests.pr_closed_at, + date_part('year'::text, (pull_requests.pr_created_at)::date) AS created_year, + date_part('month'::text, (pull_requests.pr_created_at)::date) AS created_month, + date_part('year'::text, (pull_requests.pr_closed_at)::date) AS closed_year, + date_part('month'::text, (pull_requests.pr_closed_at)::date) AS closed_month, + base_labels.pr_src_meta_label, + base_labels.pr_head_or_base, + ((EXTRACT(epoch FROM pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_close, + ((EXTRACT(epoch FROM pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_close, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_first_response, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_first_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_last_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_last_response, + response_times.first_response_time, + response_times.last_response_time, + response_times.average_time_between_responses, + response_times.assigned_count, + response_times.review_requested_count, + response_times.labeled_count, + response_times.subscribed_count, + response_times.mentioned_count, + response_times.referenced_count, + response_times.closed_count, + response_times.head_ref_force_pushed_count, + response_times.merged_count, + response_times.milestoned_count, + response_times.unlabeled_count, + response_times.head_ref_deleted_count, + response_times.comment_count, + master_merged_counts.lines_added, + master_merged_counts.lines_removed, + all_commit_counts.commit_count, + master_merged_counts.file_count + FROM augur_data.repo, + augur_data.repo_groups, + ((((augur_data.pull_requests + LEFT JOIN ( SELECT pull_requests_1.pull_request_id, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'assigned'::text)) AS assigned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'review_requested'::text)) AS review_requested_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'labeled'::text)) AS labeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'unlabeled'::text)) AS unlabeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'subscribed'::text)) AS subscribed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'mentioned'::text)) AS mentioned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'referenced'::text)) AS referenced_count, + 
count(*) FILTER (WHERE ((pull_request_events.action)::text = 'closed'::text)) AS closed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_force_pushed'::text)) AS head_ref_force_pushed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_deleted'::text)) AS head_ref_deleted_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'milestoned'::text)) AS milestoned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'merged'::text)) AS merged_count, + min(message.msg_timestamp) AS first_response_time, + count(DISTINCT message.msg_timestamp) AS comment_count, + max(message.msg_timestamp) AS last_response_time, + ((max(message.msg_timestamp) - min(message.msg_timestamp)) / (count(DISTINCT message.msg_timestamp))::double precision) AS average_time_between_responses + FROM augur_data.pull_request_events, + augur_data.pull_requests pull_requests_1, + augur_data.repo repo_1, + augur_data.pull_request_message_ref, + augur_data.message + WHERE ((repo_1.repo_id = pull_requests_1.repo_id) AND (pull_requests_1.pull_request_id = pull_request_events.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_message_ref.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + GROUP BY pull_requests_1.pull_request_id) response_times ON ((pull_requests.pull_request_id = response_times.pull_request_id))) + LEFT JOIN ( SELECT pull_request_commits.pull_request_id, + count(DISTINCT pull_request_commits.pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY pull_request_commits.pull_request_id) all_commit_counts ON ((pull_requests.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT max(pull_request_meta.pr_repo_meta_id) AS max, + pull_request_meta.pull_request_id, + pull_request_meta.pr_head_or_base, + pull_request_meta.pr_src_meta_label + FROM augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_meta.pr_head_or_base)::text = 'base'::text)) + GROUP BY pull_request_meta.pull_request_id, pull_request_meta.pr_head_or_base, pull_request_meta.pr_src_meta_label) base_labels ON ((base_labels.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT sum(commits.cmt_added) AS lines_added, + sum(commits.cmt_removed) AS lines_removed, + pull_request_commits.pull_request_id, + count(DISTINCT commits.cmt_filename) AS file_count + FROM augur_data.pull_request_commits, + augur_data.commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE (((commits.cmt_commit_hash)::text = (pull_request_commits.pr_cmt_sha)::text) AND (pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND (commits.repo_id = pull_requests_1.repo_id) AND ((commits.cmt_commit_hash)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((commits.cmt_commit_hash)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY 
pull_request_commits.pull_request_id) master_merged_counts ON ((base_labels.pull_request_id = master_merged_counts.pull_request_id))) + WHERE ((repo.repo_group_id = repo_groups.repo_group_id) AND (repo.repo_id = pull_requests.repo_id)) + ORDER BY response_times.merged_count DESC;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_issue_assignments as + SELECT + i.issue_id, + i.repo_id AS ID, + i.created_at AS created, + i.closed_at AS closed, + ie.created_at AS assign_date, + ie.ACTION AS assignment_action, + ie.cntrb_id AS assignee, + ie.node_id as node_id + FROM + ( + augur_data.issues i + LEFT JOIN augur_data.issue_events ie ON ( + ( + ( i.issue_id = ie.issue_id ) + AND ( + ( ie.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_user_repos(login_name,user_id,group_id,repo_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response_times(repo_id, pr_src_id, pr_src_meta_label);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_assignments(pull_request_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_issue_assignments(issue_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response(pull_request_id, id, cntrb_id, msg_cntrb_id, msg_timestamp);""")) + conn.execute(text("""COMMIT;""")) \ No newline at end of file diff --git a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py index 8d75b7a70..0d9c6d744 100644 --- a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py +++ b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py @@ -85,9 +85,9 @@ def upgrade(): table_changes = """ - ALTER TABLE user_repos + ALTER TABLE augur_operations.user_repos ADD COLUMN group_id BIGINT, - ADD CONSTRAINT user_repos_group_id_fkey FOREIGN KEY (group_id) REFERENCES user_groups(group_id), + ADD CONSTRAINT user_repos_group_id_fkey FOREIGN KEY (group_id) REFERENCES augur_operations.user_groups(group_id), DROP COLUMN user_id, ADD PRIMARY KEY (group_id, repo_id); """ diff --git a/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py b/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py index 288f584cf..52a6e017d 100644 --- a/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py +++ b/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py @@ -25,7 +25,7 @@ def upgrade(): conn = op.get_bind() result = conn.execute(text(f"""SELECT * FROM "augur_data"."repo_groups" WHERE rg_name='{repo_group_name}';""")).fetchall() if len(result) == 0: - conn.execute(f"""INSERT INTO "augur_data"."repo_groups" ("rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ('{repo_group_name}', 'DO NOT DELETE OR FRONTEND REPOS WILL BREAK', '', 0, '2023-02-17 15:00:00', NULL, NULL, NULL, NULL, NULL);""") + 
conn.execute(text(f"""INSERT INTO "augur_data"."repo_groups" ("rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ('{repo_group_name}', 'DO NOT DELETE OR FRONTEND REPOS WILL BREAK', '', 0, '2023-02-17 15:00:00', NULL, NULL, NULL, NULL, NULL);""")) # ### end Alembic commands ### diff --git a/augur/application/util.py b/augur/application/util.py index 1915abdeb..03e591df9 100644 --- a/augur/application/util.py +++ b/augur/application/util.py @@ -25,6 +25,3 @@ def get_all_repos_count(**kwargs): result = controller.get_repo_count(source="all", **kwargs) return result - - - diff --git a/augur/tasks/data_analysis/clustering_worker/setup.py b/augur/tasks/data_analysis/clustering_worker/setup.py index 9a1b425f9..78fb0b4b5 100644 --- a/augur/tasks/data_analysis/clustering_worker/setup.py +++ b/augur/tasks/data_analysis/clustering_worker/setup.py @@ -29,11 +29,11 @@ def read(filename): 'psycopg2-binary==2.9.3', #'sklearn==0.0.0', 'scikit-learn==1.1.3', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', 'seaborn==0.11.1', - 'pandas==1.3.5', - 'matplotlib==3.5.1' + 'pandas==1.5.3', + 'matplotlib>=3.5.1' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index 2d4f4973d..c102e6c22 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -116,7 +116,9 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: """ ) # result = db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) - msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, engine, params={"repo_id": repo_id}) + + with engine.connect() as conn: + msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, conn, params={"repo_id": repo_id}) logger.info(msg_df_cur_repo.head()) logger.debug(f"Repo message df size: {len(msg_df_cur_repo.index)}") @@ -303,7 +305,9 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): AND prmr.msg_id=m.msg_id """ ) - msg_df_all = pd.read_sql(get_messages_sql, engine, params={}) + + with engine.connect() as conn: + msg_df_all = pd.read_sql(get_messages_sql, conn, params={}) # select only highly active repos logger.debug("Selecting highly active repos") diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 8034112ad..4521a722e 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -56,9 +56,10 @@ def contributor_breadth_model() -> None: ) b """) - result = engine.execute(cntrb_login_query) + with engine.connect() as connection: + result = connection.execute(cntrb_login_query) - current_cntrb_logins = [dict(row) for row in result] + current_cntrb_logins = [dict(row) for row in result.mappings()] cntrb_newest_events_query = s.sql.text(""" @@ -68,8 +69,10 @@ def contributor_breadth_model() -> None: GROUP BY c.gh_login; """) - cntrb_newest_events_list = engine.execute(cntrb_newest_events_query) - cntrb_newest_events_list = [dict(row) for row in cntrb_newest_events_list] + with engine.connect() as connection: + cntrb_newest_events_list = connection.execute(cntrb_newest_events_query) + + 
cntrb_newest_events_list = [dict(row) for row in cntrb_newest_events_list.mappings()] cntrb_newest_events_map = {} for cntrb_event in cntrb_newest_events_list: diff --git a/augur/tasks/data_analysis/discourse_analysis/setup.py b/augur/tasks/data_analysis/discourse_analysis/setup.py index 9a4e91c01..37d6557ec 100644 --- a/augur/tasks/data_analysis/discourse_analysis/setup.py +++ b/augur/tasks/data_analysis/discourse_analysis/setup.py @@ -28,13 +28,13 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'nltk==3.6.6', - 'pandas==1.3.5', + 'pandas==1.5.3', 'scikit-learn==1.1.3', 'textblob==0.15.3', - 'python-crfsuite==0.9.8', - 'sklearn-crfsuite==0.3.6', + 'python-crfsuite>=0.9.8', + 'sklearn-crfsuite>=0.3.6', 'tabulate==0.8.9' ], # python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6 tabulate-0.8.9 entry_points={ diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 2febe8636..5a9941679 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -72,7 +72,9 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: """) # result = db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) - msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, engine, params={"repo_id": repo_id}) + + with engine.connect() as conn: + msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, conn, params={"repo_id": repo_id}) msg_df_cur_repo = msg_df_cur_repo.sort_values(by=['thread_id']).reset_index(drop=True) logger.info(msg_df_cur_repo.head()) diff --git a/augur/tasks/data_analysis/insight_worker/setup.py b/augur/tasks/data_analysis/insight_worker/setup.py index 0eb35d8a7..1ee6e8a4b 100644 --- a/augur/tasks/data_analysis/insight_worker/setup.py +++ b/augur/tasks/data_analysis/insight_worker/setup.py @@ -29,9 +29,9 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy>=1.7.3', + 'scipy>=1.10.0', 'sklearn==0.0', - 'numpy==1.22.0', + 'numpy==1.26.0', ], entry_points={ 'console_scripts': [ diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 7f506c8d1..37ae5f484 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -134,13 +134,16 @@ def insight_model(repo_git: str,logger,engine,session) -> None: WHERE repo_insights.ri_metric = to_delete.ri_metric AND repo_insights.ri_field = to_delete.ri_field """) - result = engine.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) + + with engine.connect() as conn: + result = conn.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) # get table values to check for dupes later on table_values_sql = s.sql.text("""SELECT * FROM repo_insights_records WHERE repo_id={}""".format(repo_id)) - insight_table_values = pd.read_sql(table_values_sql, engine, params={}) + with engine.connect() as conn: + insight_table_values = pd.read_sql(table_values_sql,conn, params={}) to_model_columns = df.columns[0:len(metrics) + 1] @@ -257,7 +260,7 @@ def classify_anomalies(df, metric): repo_insight_record_obj.ri_id)) # Send insight to Jonah for slack bot - send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger) + send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger,engine) insight_count += 1 else: @@ -526,8 +529,8 @@ def
send_insight(insight, units_from_mean, logger, engine): FROM repo, repo_groups WHERE repo_id = {} """.format(insight['repo_id'])) - - repo = pd.read_sql(repoSQL, engine, params={}).iloc[0] + with engine.connect() as conn: + repo = pd.read_sql(repoSQL, conn, params={}).iloc[0] begin_date = datetime.datetime.now() - datetime.timedelta(days=anomaly_days) dict_date = insight['ri_date'].strftime("%Y-%m-%d %H:%M:%S") @@ -565,7 +568,8 @@ def clear_insights(repo_id, new_endpoint, new_field, logger): AND ri_field = '{}' """.format(repo_id, new_endpoint, new_field) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -582,7 +586,8 @@ def clear_insights(repo_id, new_endpoint, new_field, logger): AND ri_field = '{}' """.format(repo_id, new_endpoint, new_field) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -602,7 +607,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_field = '{}' ORDER BY ri_score DESC """.format(repo_id, new_metric, new_field)) - rec = json.loads(pd.read_sql(recordSQL, engine, params={}).to_json(orient='records')) + with engine.connect() as conn: + rec = json.loads(pd.read_sql(recordSQL, conn, params={}).to_json(orient='records')) logger.info("recordsql: {}, \n{}".format(recordSQL, rec)) # If new score is higher, continue with deletion if len(rec) > 0: @@ -623,7 +629,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_field = '{}' """.format(record['repo_id'], record['ri_metric'], record['ri_field']) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) else: @@ -637,7 +644,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): WHERE repo_id = {} ORDER BY ri_score ASC """.format(repo_id)) - ins = json.loads(pd.read_sql(insightSQL, engine, params={}).to_json(orient='records')) + with engine.connect() as conn: + ins = json.loads(pd.read_sql(insightSQL, conn, params={}).to_json(orient='records')) logger.info("This repos insights: {}".format(ins)) # Determine if inisghts need to be deleted based on if there are more insights than we want stored, @@ -675,7 +683,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_metric = '{}' """.format(insight['repo_id'], insight['ri_metric']) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -744,7 +753,9 @@ def filter_duplicates(cols, tables, og_data, logger, engine): colSQL = s.sql.text(""" SELECT {} FROM {} """.format(col, table_str)) - values = pd.read_sql(colSQL, engine, params={}) + + with engine.connect() as conn: + values = pd.read_sql(colSQL, conn, params={}) for obj in og_data: if values.isin([obj[cols[col]]]).any().any(): diff --git a/augur/tasks/data_analysis/message_insights/setup.py b/augur/tasks/data_analysis/message_insights/setup.py index 311eb9b6f..a4f6a30c4 100644 --- a/augur/tasks/data_analysis/message_insights/setup.py +++ b/augur/tasks/data_analysis/message_insights/setup.py @@ -30,22 +30,22 @@ def read(filename): 'requests==2.28.0', 
'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'scikit-learn==1.1.3', #0.24.2', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', - 'pandas==1.3.5', + 'pandas==1.5.3', 'emoji==1.2.0', - 'Keras<2.9.0rc0', - 'Keras-Preprocessing==1.1.2', - 'tensorflow==2.8.0', - 'h5py~=3.6.0', + 'keras>=2.15.0', + 'Keras-Preprocessing', + 'tensorflow==2.15.0', + 'h5py==3.10.0', 'scikit-image==0.19.1', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost', 'bs4==0.0.1', 'xlrd==2.0.1', - 'gensim==4.2.0' + 'gensim>=4.2.0' ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 1acec976c..4727d3def 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -59,7 +59,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: repo_exists_SQL = s.sql.text(""" SELECT exists (SELECT 1 FROM augur_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") - df_rep = pd.read_sql_query(repo_exists_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_rep = pd.read_sql_query(repo_exists_SQL, conn, params={'repo_id': repo_id}) #full_train = not(df_rep['exists'].iloc[0]) logger.info(f'Full Train: {full_train}') @@ -84,7 +85,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: where message.repo_id = :repo_id """) - df_past = pd.read_sql_query(past_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(past_SQL, conn, params={'repo_id': repo_id}) df_past['msg_timestamp'] = pd.to_datetime(df_past['msg_timestamp']) df_past = df_past.sort_values(by='msg_timestamp') @@ -124,7 +126,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id""") - df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id, 'begin_date': begin_date}) + with engine.connect() as conn: + df_message = pd.read_sql_query(join_SQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date}) logger.info(f'Messages dataframe dim: {df_message.shape}') logger.info(f'Value 1: {df_message.shape[0]}') @@ -159,7 +162,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") - df_past = pd.read_sql_query(merge_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(merge_SQL, conn, params={'repo_id': repo_id}) df_past = df_past.loc[df_past['novelty_flag'] == 0] rec_errors = df_past['reconstruction_error'].tolist() threshold = threshold_otsu(np.array(rec_errors)) @@ -345,7 +349,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: FROM message_analysis_summary WHERE repo_id=:repo_id""") - df_past = pd.read_sql_query(message_analysis_query, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(message_analysis_query, conn, params={'repo_id': repo_id}) # df_past = get_table_values(cols=['period', 'positive_ratio', 'negative_ratio', 'novel_count'], # tables=['message_analysis_summary'], @@ -414,12 +419,13 @@ def send_insight(repo_id, insights, logger, engine): WHERE repo_id = {} """.format(repo_id)) - repo = 
pd.read_sql(repoSQL, engine, params={}).iloc[0] + with engine.connect() as conn: + repo = pd.read_sql(repoSQL, conn, params={}).iloc[0] to_send = { 'message_insight': True, 'repo_git': repo['repo_git'], - 'insight_begin_date': begin_date.strftime("%Y-%m-%d %H:%M:%S"), + 'insight_begin_date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # date from when insights are calculated 'sentiment': insights[0], # sentiment insight dict 'novelty': insights[1], # novelty insight dict @@ -449,13 +455,14 @@ def get_max_id(table, column, logger, engine, default=25150): SELECT max({0}.{1}) AS {1} FROM {0} """.format(table, column)) - rs = pd.read_sql(max_id_sql, engine, params={}) + + with engine.connect() as conn: + rs = pd.read_sql(max_id_sql, conn, params={}) if rs.iloc[0][column] is not None: max_id = int(rs.iloc[0][column]) + 1 logger.info("Found max id for {} column in the {} table: {}\n".format(column, table, max_id)) else: max_id = default - logger.warning("Could not find max id for {} column in the {} table... " + - "using default set to: {}\n".format(column, table, max_id)) + logger.warning(f"Could not find max id for {column} column in the {table} table... using default set to: {max_id}\n") return max_id diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py index dc13c94bf..3341f24ff 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py @@ -29,12 +29,12 @@ def read(filename): 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'nltk==3.6.6', - 'numpy==1.22.0', - 'pandas==1.3.5', + 'numpy==1.26.0', + 'pandas==1.5.3', 'emoji==1.2.0', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost==1.4.2', - 'scipy==1.7.3' + 'scipy>=1.10.0' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index c2816bed8..9d6d5be78 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -74,8 +74,8 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: and pull_requests.repo_id = :repo_id and pr_src_state like 'open' """) - - df_pr = pd.read_sql_query(pr_SQL, engine, params={'begin_date': begin_date, 'repo_id': repo_id}) + with engine.connect() as conn: + df_pr = pd.read_sql_query(pr_SQL, conn, params={'begin_date': begin_date, 'repo_id': repo_id}) logger.info(f'PR Dataframe dim: {df_pr.shape}\n') @@ -106,15 +106,16 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") - - df_message = pd.read_sql_query(messages_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_message = pd.read_sql_query(messages_SQL, conn, params={'repo_id': repo_id}) logger.info(f'Mapping messages to PR, find comment & participants counts') # Map PR to its corresponding messages pr_ref_sql = s.sql.text("select * from augur_data.pull_request_message_ref") - df_pr_ref = pd.read_sql_query(pr_ref_sql, engine) + with engine.connect() as conn: + df_pr_ref = 
pd.read_sql_query(pr_ref_sql, conn) df_merge = pd.merge(df_pr, df_pr_ref, on='pull_request_id', how='left') df_merge = pd.merge(df_merge, df_message, on='msg_id', how='left') df_merge = df_merge.dropna(subset=['msg_id'], axis=0) @@ -167,7 +168,9 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: SELECT repo_id, pull_requests_merged, pull_request_count,watchers_count, last_updated FROM augur_data.repo_info where repo_id = :repo_id """) - df_repo = pd.read_sql_query(repo_sql, engine, params={'repo_id': repo_id}) + + with engine.connect() as conn: + df_repo = pd.read_sql_query(repo_sql, conn, params={'repo_id': repo_id}) df_repo = df_repo.loc[df_repo.groupby('repo_id').last_updated.idxmax(), :] df_repo = df_repo.drop(['last_updated'], axis=1) diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index 76420c253..f04d01552 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -59,15 +59,35 @@ def refresh_materialized_views(): COMMIT; """) + mv9_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_user_repos with data; + COMMIT; + """) - try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv1_refresh) - except Exception as e: - logger.info(f"error is {e}") - pass + mv10_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response_times with data; + COMMIT; + """) + mv11_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_assignments with data; + COMMIT; + """) + + mv12_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_issue_assignments with data; + COMMIT; + """) + + mv13_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response with data; + COMMIT; + """) try: with DatabaseSession(logger, engine) as session: @@ -125,7 +145,40 @@ def refresh_materialized_views(): logger.info(f"error is {e}") pass + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv9_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv10_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv11_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv12_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv13_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index b8eb8b203..fffd79d33 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -30,15 +30,15 @@ def add_org_repo_list(user_id, group_name, urls): valid_repos = [] for url in urls: - # matches https://github.com/{org}/ or htts://github.com/{org} + # matches https://github.com/{org}/ or http://github.com/{org} if Repo.parse_github_org_url(url): - added = user.add_org(group_name, url)[0] + added = user.add_github_org(group_name, url)[0] if added: valid_orgs.append(url) - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + # matches https://github.com/{org}/{repo}/ or 
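Since the hunk above adds five more near-identical REFRESH blocks, a loop over the view names would express the same behavior more compactly. A possible consolidation (a sketch, not part of the diff) is shown below, assuming `logger`, `engine`, and `DatabaseSession` are in scope as they are in this module:

```py
# Possible consolidation of the repeated refresh/try blocks (sketch only).
import sqlalchemy as s

view_names = [
    "explorer_user_repos",
    "explorer_pr_response_times",
    "explorer_pr_assignments",
    "explorer_issue_assignments",
    "explorer_pr_response",
]

for view in view_names:
    refresh = s.sql.text(
        f"REFRESH MATERIALIZED VIEW CONCURRENTLY augur_data.{view} WITH DATA; COMMIT;"
    )
    try:
        with DatabaseSession(logger, engine) as session:
            session.execute_sql(refresh)
    except Exception as e:
        logger.info(f"error is {e}")
```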
http://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: - added = user.add_repo(group_name, url)[0] + added = user.add_github_repo(group_name, url)[0] if added: valid_repos.append(url) @@ -46,7 +46,7 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_and_repo_name(url)): org, repo = match.groups() repo_url = f"https://github.com/{org}/{repo}/" - added = user.add_repo(group_name, repo_url)[0] + added = user.add_github_repo(group_name, repo_url)[0] if added: valid_repos.append(url) @@ -54,9 +54,17 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_name(url)): org = match.group(1) org_url = f"https://github.com/{org}/" - added = user.add_org(group_name, org_url)[0] + added = user.add_github_org(group_name, org_url)[0] if added: valid_orgs.append(url) + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + added = user.add_gitlab_repo(group_name, url)[0] + if added: + valid_repos.append(url) + else: invalid_urls.append(url) @@ -66,24 +74,25 @@ def add_org_repo_list(user_id, group_name, urls): - +# TODO: Change to github specific @celery.task def add_repo(user_id, group_name, repo_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, user_id, group_name) + result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) print(repo_url, result) +# TODO: Change to github specific @celery.task def add_org(user_id, group_name, org_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) print(org_url, result) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index c763a2a2c..ee3dc047f 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -31,7 +31,8 @@ from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits from augur.tasks.github.facade_github.tasks import * -from augur.tasks.util.collection_util import CollectionState, get_collection_status_repo_git_from_filter +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 304574bc8..cf7d2d1e5 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,20 +6,24 @@ from augur.tasks.github.util.util import parse_json_response import logging from datetime import datetime -from enum import Enum +from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" -def update_repo_with_dict(current_dict,new_dict,logger,db): - +def update_repo_with_dict(repo,new_dict,logger,db): + """ + Update a repository record in the database using a dictionary tagged with + the appropriate table fields + + Args: + repo: 
orm repo object to update + new_dict: dict of new values to add to the repo record + logger: logging object + db: db object + """ - to_insert = current_dict + to_insert = repo.__dict__ del to_insert['_sa_instance_state'] to_insert.update(new_dict) @@ -45,7 +49,6 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" - current_repo_dict = repo.__dict__ attempts = 0 while attempts < 10: @@ -56,64 +59,71 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' attempts += 1 - #Mark as errored if not found - if response_from_gh.status_code == 404: - logger.error(f"Repo {repo.repo_git} responded 404 when pinged!") + #Update Url and retry if 301 + #301 moved permanently + if response_from_gh.status_code == 301: + + owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) + try: + old_description = str(repo.description) + except Exception: + old_description = "" + + #Create new repo object to update existing repo_update_dict = { - 'repo_git': repo.repo_git, - 'repo_path': None, - 'repo_name': None, - 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted", - 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + 'repo_git': f"https://github.com/{owner}/{name}", + 'repo_path': None, + 'repo_name': None, + 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(current_repo_dict, repo_update_dict, logger, augur_db) - - raise Exception(f"ERROR: Repo not found at requested host {repo.repo_git}") - elif attempts >= 10: - logger.warning(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") - return - + update_repo_with_dict(repo, repo_update_dict, logger,augur_db) - #skip if not moved - #301 moved permanently - if response_from_gh.status_code != 301: - logger.info(f"Repo found at url: {url}") - return + raise Exception("ERROR: Repo has moved! Resetting Collection!") - owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) - - - try: - old_description = str(repo.description) - except: - old_description = "" + #Mark as ignore if 404 + if response_from_gh.status_code == 404: + repo_update_dict = { + 'repo_git': repo.repo_git, + 'repo_path': None, + 'repo_name': None, + 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. 
Instead, it appears to be deleted", + 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + } - #Create new repo object to update existing - repo_update_dict = { - 'repo_git': f"https://github.com/{owner}/{name}", - 'repo_path': None, - 'repo_name': None, - 'description': f"(Originally hosted at {url}) {old_description}" - } + update_repo_with_dict(repo, repo_update_dict, logger, augur_db) - update_repo_with_dict(current_repo_dict, repo_update_dict, logger,augur_db) + statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + collectionRecord = execute_session_query(statusQuery,'one') - collectionRecord = execute_session_query(statusQuery,'one') - if collection_hook == 'core': - collectionRecord.core_status = CollectionState.PENDING.value + collectionRecord.core_status = CollectionState.IGNORE.value collectionRecord.core_task_id = None collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.PENDING.value + + collectionRecord.secondary_status = CollectionState.IGNORE.value collectionRecord.secondary_task_id = None collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() + collectionRecord.facade_status = CollectionState.IGNORE.value + collectionRecord.facade_task_id = None + collectionRecord.facade_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - raise Exception("ERROR: Repo has moved! Marked repo as pending and stopped collection") + collectionRecord.ml_status = CollectionState.IGNORE.value + collectionRecord.ml_task_id = None + collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + augur_db.session.commit() + raise Exception("ERROR: Repo has moved! Resetting Collection!") + + + if attempts >= 10: + logger.error(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") + raise Exception(f"ERROR: Could not get api response for repo: {url}") + #skip if not 404 + logger.info(f"Repo found at url: {url}") + return + diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index 129afd0de..640079d85 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -210,9 +210,11 @@ def update_issue_closed_cntrbs_from_events(engine, repo_id): SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL """) - result = engine.execute(get_ranked_issues).fetchall() - update_data = [{'issue_id': row['issue_id'], 'cntrb_id': row['cntrb_id'], 'repo_id': repo_id} for row in result] + with engine.connect() as conn: + result = conn.execute(get_ranked_issues).fetchall() + + update_data = [{'issue_id': row[0], 'cntrb_id': row[1], 'repo_id': repo_id} for row in result] with engine.connect() as connection: update_stmt = s.text(""" UPDATE issues diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 577f17c32..26d102753 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -252,8 +252,8 @@ def insert_facade_contributors(repo_id): """).bindparams(repo_id=repo_id) #Execute statement with session. 
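The move-detection change earlier in this hunk keys off GitHub's HTTP status codes. As a standalone illustration of that behavior (a hypothetical helper, not Augur's implementation), a bare check might look like:

```py
# 301 => repository moved (new API location in the Location header);
# 404 => repository deleted or inaccessible. Hypothetical sketch.
import httpx

def check_github_repo(owner: str, name: str) -> str:
    url = f"https://api.github.com/repos/{owner}/{name}"
    response = httpx.get(url, follow_redirects=False)
    if response.status_code == 301:
        return response.headers["location"]  # repo moved; header points at the new endpoint
    if response.status_code == 404:
        raise ValueError(f"{url} appears to be deleted")
    return url
```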
- result = manifest.augur_db.execute_sql(new_contrib_sql).fetchall() - new_contribs = [dict(zip(row.keys(), row)) for row in result] + result = manifest.augur_db.execute_sql(new_contrib_sql) + new_contribs = [dict(row) for row in result.mappings()] #print(new_contribs) @@ -303,8 +303,8 @@ def insert_facade_contributors(repo_id): #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ # 'repo_id': repo_id}).to_json(orient="records")) - result = session.execute_sql(resolve_email_to_cntrb_id_sql).fetchall() - existing_cntrb_emails = [dict(zip(row.keys(), row)) for row in result] + result = session.execute_sql(resolve_email_to_cntrb_id_sql) + existing_cntrb_emails = [dict(row) for row in result.mappings()] print(existing_cntrb_emails) link_commits_to_contributor(session,list(existing_cntrb_emails)) diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 5380b8bf1..0ba793470 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -195,7 +195,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], "issue_id", issue_id) - logger.info(f"{task_name}: Inserting other issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + logger.info(f"{task_name}: Inserting other github issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") # inserting issue labels # we are using label_src_id and issue_id to determine if the label is already in the database. diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 6e23434ba..4dfd3a634 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -187,7 +187,8 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) - + if message_return_data is None: + return pr_message_ref_dicts = [] issue_message_ref_dicts = [] diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index e7ebcd945..81b4c4397 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -20,8 +20,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = augur_db.execute_sql(pr_number_sql).fetchall() - pr_numbers = [dict(zip(row.keys(), row)) for row in result] + result = augur_db.execute_sql(pr_number_sql)#.fetchall() + pr_numbers = [dict(row) for row in result.mappings()] query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 3af6e39e0..8db394754 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -74,9 +74,18 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: return all_data - -def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): +def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): + """ + Parse and insert all 
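The `result.mappings()` calls introduced above rely on SQLAlchemy 1.4+'s RowMapping interface; a minimal standalone sketch of the idiom (placeholder engine URL and query):

```py
# Each RowMapping behaves like a read-only dict keyed by column name, which is
# what replaces the older dict(zip(row.keys(), row)) construction.
import sqlalchemy as s

engine = s.create_engine("postgresql+psycopg2://user:password@localhost:5432/augur")  # placeholder

with engine.connect() as conn:
    result = conn.execute(s.text("SELECT cntrb_id, cntrb_email FROM contributors"))
    rows = [dict(row) for row in result.mappings()]
```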
retrieved PR data. + + Arguments: + pull_requests: List of paginated pr endpoint data + task_name: Name of the calling task and the repo + repo_id: augur id of the repository + logger: logging object + augur_db: sqlalchemy db object + """ tool_source = "Pr Task" tool_version = "2.0" data_source = "Github API" diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index f3050fc1b..5957d4cb5 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -84,7 +84,8 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): release_inf = get_release_inf(repo_id, release, tag_only) #Do an upsert - augur_db.insert_data(release_inf,Release,['release_id']) + string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] + augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 8a19430e8..20ce07f06 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -32,7 +32,7 @@ def __init__(self, session: DatabaseSession): self.logger = session.logger self.config = AugurConfig(self.logger, session) - self.oauth_redis_key = "oauth_keys_list" + self.oauth_redis_key = "github_oauth_keys_list" self.redis_key_list = RedisList(self.oauth_redis_key) diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 548d25b0f..31c14565d 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -154,6 +154,8 @@ class GithubApiResult(Enum): SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected BAD_CREDENTIALS = 7 HTML = 8 EMPTY_STRING = 9 diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index fbb23dd6e..42989dcca 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -54,10 +54,21 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic try: return response.json() except json.decoder.JSONDecodeError as e: - logger.warning(f"invalid return from GitHub. Response was: {response.text}. Exception: {e}") + logger.warning(f"invalid return. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) def get_repo_weight_by_issue(logger,repo_git): + """ + Retrieve the sum of the number of issues and prs in a repository from a graphql query. 
+ + Arguments: + logger: logger object + repo_git: repository url + + Returns: + Sum of issues and prs for that repo + """ + from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql owner,name = get_owner_repo(repo_git) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py new file mode 100644 index 000000000..8058831ba --- /dev/null +++ b/augur/tasks/gitlab/events_task.py @@ -0,0 +1,209 @@ +""" +Module to define the task methods to collect gitlab event data for augur +""" +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_events(repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab issue events: {len(events)}") + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue events") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_request_events(repo_git) -> int: + """ + Retrieve and parse gitlab mrs for the desired repo + + Arguments: + repo_git: the repo url string + """ + + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab merge request events: {len(events)}") + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request events") + + +def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + gtype: type of event data + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issue events for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}" + events 
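A note on the `{owner}%2f{repo}` fragment in the URL above: GitLab's REST API identifies a project either by numeric id or by its URL-encoded `namespace/project` path. A small illustration using only the standard library (values are examples):

```py
# Building the encoded project path that GitLab's /projects/:id endpoints expect.
from urllib.parse import quote

owner, repo = "gitlab-org", "gitlab"
project = quote(f"{owner}/{repo}", safe="")      # -> "gitlab-org%2Fgitlab"
url = f"https://gitlab.com/api/v4/projects/{project}/events?target_type=issue"
```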
= GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = events.get_num_pages(url) + for page_data, page in events.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab {gtype} Events Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: {gtype} Events Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab {gtype} Events Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issue_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + events: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue events task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_event_dicts = [] + + # create mapping from issue number to issue id of current issues + issue_url_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id + + for event in events: + + issue_number = event["target_iid"] + + try: + issue_id = issue_url_to_id_map[issue_number] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for an issue with number {issue_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + issue_event_dicts.append( + extract_gitlab_issue_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") + issue_event_natural_keys = ["issue_id", "issue_event_src_id"] + augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + + +def process_mr_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr events from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + tool_source = "Gitlab mr events task" + tool_version = "2.0" + data_source = "Gitlab API" + + mr_event_dicts = [] + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + for event in events: + + mr_number = event["target_iid"] + + try: + issue_id = mr_number_to_id_map[mr_number] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for an mr with number {mr_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + mr_event_dicts.append( + extract_gitlab_mr_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + # TODO: Add unique key for this + logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") + mr_event_natural_keys = ["pull_request_id", 
"issue_event_src_id"] + augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + + diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py new file mode 100644 index 000000000..5303d606e --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -0,0 +1,386 @@ +""" +Defines a GitlabApiHandler class to paginate and handle interaction with GitLab's +api through automatic use of relevant key auth and pagination tools. +""" +import httpx +import time +import logging + +from typing import List, Optional, Union, Generator, Tuple +from urllib.parse import urlencode, urlparse, parse_qs, urlunparse +from enum import Enum + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.tasks.github.util.util import parse_json_response + +class GitlabApiResult(Enum): + """All the different results of querying the Gitlab API.""" + + SUCCESS = 0 + TIMEOUT = 1 + NO_MORE_ATTEMPTS = 2 + NOT_FOUND = 3 + SECONDARY_RATE_LIMIT = 4 + RATE_LIMIT_EXCEEDED = 5 + ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected + BAD_CREDENTIALS = 7 + +class GitlabApiHandler(): + """This class is a sequence that handles retrieving data from the Gitlab API. + + Attributes: + url (str): The url that we are collecting data + key_mangager (GitlabRandomKeyAuth): Custom httpx auth class + that randomizes the github api key a request gets. + This is how the requests are getting their api keys + logger (logging.Logger): Logger that handler printing information to files and stdout + """ + + def __init__(self, key_manager: GitlabRandomKeyAuth, logger: logging.Logger): + """Initialize the class GitlabPaginator. + + Args: + url: url that the data is being collected + key_manager: class that randomly selects a Gitlab API key for each request + logger: handles logging + from_datetime: collects data after this datatime (not yet implemented) + to_datetime: collects data before this datatime (not yet implemented) + """ + self.key_manager = key_manager + self.logger = logger + + def get_length(self, url): + """Get the length of the Gitlab API data. + + Returns: + The length of the Gitlab API data at the url. + + Examples: + This function is called when len() is called on the GitlabPaginator class for example. + + issues = GitlabPaginator(url, session.oauths, logger) + issue_len = len(issues) + """ + + num_pages = self.get_num_pages(url) + + self.logger.info(f"Num pages: {num_pages}") + + params = {"page": num_pages} + url = add_query_params(url, params) + + # get the amount of data on last page + data, _, result = self.retrieve_data(url) + + if result == GitlabApiResult.SUCCESS: + return (100 * (num_pages -1)) + len(data) + + self.logger.debug("Unable to retrieve data length from api") + return 0 + + def iter(self, url) -> Generator[Optional[dict], None, None]: + """Provide data from Gitlab API via a generator that yields one dict at a time. 
+ + Yields: + A piece of data from the github api as the specified url + """ + + url = self._set_paginaton_query_params(url) + + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None + return + + # yield the first page data + for data in data_list: + yield data + + while 'next' in response.links.keys(): + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + return + + for data in data_list: + yield data + + def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, None]: + """Provide data from Gitlab API via a generator that yields a page of dicts at a time. + + Returns: + A page of data from the Gitlab API at the specified url + """ + + url = self._set_paginaton_query_params(url) + + # retrieves the data for the given url + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None, None + return + + # this retrieves the page for the given url + page_number = get_url_page_number(url) + + # yields the first page of data and its page number + yield data_list, page_number + + while 'next' in response.links.keys(): + + # gets the next page from the last responses header + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug(f"Failed to retrieve the data for even though 10 attempts were given. Url: {next_page}") + return + + page_number = get_url_page_number(next_page) + + # if either the data or response is None then yield None and return + if data_list is None or response is None: + return + + # yield the data from the page and its number + yield data_list, page_number + + def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx.Response]]: + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + + Returns + The response object from hitting the url and the data on the page + """ + + timeout = 30 + timeout_count = 0 + num_attempts = 1 + while num_attempts <= 10: + + response = hit_api(self.key_manager, url, self.logger, timeout) + + num_attempts += 1 + + if response is None: + if timeout_count == 10: + self.logger.error(f"Request timed out 10 times for {url}") + return None, None, GitlabApiResult.TIMEOUT + + timeout = timeout * 1.1 + num_attempts += 1 + continue + + if response.status_code == 500: + self.logger.error(f"Gitlab returned {response.status_code} error when fetching {url}. 
Message: {response.json()}") + continue + + if response.status_code == 429: + + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["ratelimit-reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nGitlab API rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + continue + + if response.status_code == 404: + self.logger.info(f"ERROR: 404 not found for {url}") + return [], response, GitlabApiResult.NOT_FOUND + + if response.status_code == 204: + return [], response, GitlabApiResult.SUCCESS + + if response.status_code >= 200 and response.status_code <=299: + + page_data = parse_json_response(self.logger, response) + return page_data, response, GitlabApiResult.SUCCESS + + self.logger.warning(f"Unhandled gitlab response. Status code: {response.status_code}. Body: {response.json()}") + + + + self.logger.error("Unable to collect data in 10 attempts") + return None, None, GitlabApiResult.NO_MORE_ATTEMPTS + + def get_num_pages(self, url) -> Optional[int]: + """Get the number of pages of data that a url can paginate through. + + Returns: + The number of pages a url can access + """ + + url = self._set_paginaton_query_params(url) + + timeout: float = 5 + num_attempts = 0 + while num_attempts < 10: + r = self.hit_api(url=url, timeout=timeout, method="HEAD") + + if r: + break + + timeout = timeout * 1.2 + else: + raise RuntimeError("Unable to get the number of pages of data in 10 attempts") + + if 'last' not in r.links.keys(): + return 1 + + # get the last url from header + last_page_url = r.links['last']['url'] + + parsed_url = urlparse(last_page_url) + try: + num_pages = int(parse_qs(parsed_url.query)['page'][0]) + except (KeyError, ValueError): + return None + + return num_pages + + def hit_api(self, url, timeout, method): + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + timeout: time to wait until timeout + method: GET, POST, etc. + + Returns + The response object from hitting the url and the data on the page + """ + + return hit_api(self.key_manager, url, self.logger, timeout, method=method) + + def _set_paginaton_query_params(self, url): + + remove_fields = ["per_page", "page"] + url = clean_url(url, remove_fields) + + # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request + # this is because github will only append specified params to the links in the headers if they are a part + # of the url, and not the params with the request + params = {"per_page": 100} + url = add_query_params(url, params) + + return url + +################################################################################ + +# Url Helper Method to remove query parameters from the url +def clean_url(url: str, keys: List[str]) -> str: + """Remove query params from url. 
+ + Args: + url: the url that is being modified + keys: the query params that are being removed + + Returns: + A url with the params in keys removed + """ + u = urlparse(url) + query = parse_qs(u.query, keep_blank_values=True) + + for key in keys: + query.pop(key, None) + + u = u._replace(query=urlencode(query, True)) + + return urlunparse(u) + + +def add_query_params(url: str, additional_params: dict) -> str: + """Add query params to a url. + + Args: + url: the url that is being modified + additional_params: key value pairs specifying the parameters to be added + + Returns: + The url with the key value pairs in additional_params added as query params + """ + url_components = urlparse(url) + original_params = parse_qs(url_components.query) + # Before Python 3.5 you could update original_params with + # additional_params, but here all the variables are immutable. + merged_params = {**original_params, **additional_params} + updated_query = urlencode(merged_params, doseq=True) + # _replace() is how you can create a new NamedTuple with a changed field + return url_components._replace(query=updated_query).geturl() + + +def get_url_page_number(url: str) -> int: + """Parse the page number from the url. + + Note: + If the url does not contain a page number the function returns 1 + + Args: + url: url to get the page number from + + Returns: + The page number that the url contains + """ + parsed_url = urlparse(url) + try: + # if page is not a url query param then this is page 1 + page_number = int(parse_qs(parsed_url.query)['page'][0]) + + except KeyError: + return 1 + + return page_number + +################################################################################ + +def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: + """Ping the api and get the data back for the page. + + Returns: + A httpx response that contains the data. None if a timeout occurs + """ + # self.logger.info(f"Hitting endpoint with {method} request: {url}...\n") + + with httpx.Client() as client: + + try: + response = client.request( + method=method, url=url, auth=key_manager, timeout=timeout, follow_redirects=True) + + except TimeoutError: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.TimeoutException: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.NetworkError: + logger.info(f"Network Error. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.ProtocolError: + logger.info(f"Protocol Error. Sleeping {round(timeout*1.5)} seconds and trying again...\n") + time.sleep(round(timeout*1.5)) + return None + + return response diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py new file mode 100644 index 000000000..20bc1219c --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -0,0 +1,176 @@ +""" +Defines the handler logic needed to effectively fetch GitLab auth keys +from either the redis cache or the database. Follows the same patterns as +the github api key handler. 
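The three URL helpers defined above are pure functions over `urllib.parse` structures; their intended behavior, per their docstrings, can be illustrated as follows (values are examples):

```py
# Example round trip through the pagination URL helpers.
url = "https://gitlab.com/api/v4/projects/1/issues?page=3&per_page=20&state=opened"

cleaned = clean_url(url, ["per_page", "page"])
# -> "https://gitlab.com/api/v4/projects/1/issues?state=opened"

repaged = add_query_params(cleaned, {"per_page": 100})
# -> "https://gitlab.com/api/v4/projects/1/issues?state=opened&per_page=100"

get_url_page_number(url)      # -> 3
get_url_page_number(cleaned)  # -> 1 (no page parameter means page 1)
```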
+""" +import httpx +import time +import random + +from typing import Optional, List + +from augur.tasks.util.redis_list import RedisList +from augur.application.db.session import DatabaseSession +from augur.application.config import AugurConfig +from sqlalchemy import func + + +class NoValidKeysError(Exception): + """Defines an exception that is thrown when no gitlab keys are valid""" + + +class GitlabApiKeyHandler(): + """Handles Gitlab API key retrieval from the database and redis + + Attributes: + session (DatabaseSession): Database connection + logger (logging.Logger): Handles all logs + oauth_redis_key (str): The key where the gitlab api keys are cached in redis + redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache + config_key (str): The api key that is stored in the users config table + key: (List[str]): List of keys retrieve from database or cache + """ + + def __init__(self, session: DatabaseSession): + + self.session = session + self.logger = session.logger + self.config = AugurConfig(self.logger, session) + + self.oauth_redis_key = "gitlab_oauth_keys_list" + + self.redis_key_list = RedisList(self.oauth_redis_key) + + self.config_key = self.get_config_key() + + self.keys = self.get_api_keys() + + self.logger.info(f"Retrieved {len(self.keys)} gitlab api keys for use") + + def get_random_key(self): + """Retrieves a random key from the list of keys + + Returns: + A random gitlab api key + """ + + return random.choice(self.keys) + + def get_config_key(self) -> str: + """Retrieves the users gitlab api key from their config table + + Returns: + Github API key from config table + """ + return self.config.get_value("Keys", "gitlab_api_key") + + def get_api_keys_from_database(self) -> List[str]: + """Retieves all gitlab api keys from database + + Note: + It retrieves all the keys from the database except the one defined in the users config + + Returns: + Github api keys that are in the database + """ + from augur.application.db.models import WorkerOauth + + select = WorkerOauth.access_token + # randomizing the order at db time + #select.order_by(func.random()) + where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + + return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] + #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + + + def get_api_keys(self) -> List[str]: + """Retrieves all valid Github API Keys + + Note: + It checks to see if the keys are in the redis cache first. + It removes bad keys before returning. + If keys were taken from the database, it caches all the valid keys that were found + + Returns: + Valid Github api keys + """ + + redis_keys = list(self.redis_key_list) + + if redis_keys: + return redis_keys + + attempts = 0 + while attempts < 3: + + try: + keys = self.get_api_keys_from_database() + break + except Exception as e: + self.logger.error(f"Ran into issue when fetching key from database:\n {e}\n") + self.logger.error("Sleeping for 5 seconds...") + time.sleep(5) + attempts += 1 + + if self.config_key is not None: + keys += [self.config_key] + + if len(keys) == 0: + return [] + + valid_keys = [] + with httpx.Client() as client: + + for key in keys: + + # removes key if it returns "Bad Credentials" + if self.is_bad_api_key(client, key) is False: + valid_keys.append(key) + else: + print(f"WARNING: The key '{key}' is not a valid key. 
Hint: If valid in past it may have expired") + + # just in case the mulitprocessing adds extra values to the list. + # we are clearing it before we push the values we got + self.redis_key_list.clear() + + # add all the keys to redis + self.redis_key_list.extend(valid_keys) + + if not valid_keys: + raise NoValidKeysError("No valid gitlab api keys found in the config or worker oauth table") + + + # shuffling the keys so not all processes get the same keys in the same order + #valid_now = valid_keys + #try: + #self.logger.info(f'valid keys before shuffle: {valid_keys}') + #valid_keys = random.sample(valid_keys, len(valid_keys)) + #self.logger.info(f'valid keys AFTER shuffle: {valid_keys}') + #except Exception as e: + # self.logger.debug(f'{e}') + # valid_keys = valid_now + # pass + + return valid_keys + + def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: + """Determines if a Gitlab API key is bad + + Args: + client: makes the http requests + oauth_key: gitlab api key that is being tested + + Returns: + True if key is bad. False if the key is good + """ + + url = "https://gitlab.com/api/v4/user" + + headers = {'Authorization': f'Bearer {oauth_key}'} + + response = client.request(method="GET", url=url, headers=headers, timeout=180) + if response.status_code == 401: + return True + + return False \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py new file mode 100644 index 000000000..64ba31dd1 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -0,0 +1,26 @@ +"""Defines the GitlabRandomKeyAuth class""" + +from augur.tasks.util.random_key_auth import RandomKeyAuth +from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from augur.application.db.session import DatabaseSession + + +class GitlabRandomKeyAuth(RandomKeyAuth): + """Defines a gitlab specific RandomKeyAuth class so + gitlab collections can have a class randomly selects an api key for each request + """ + + def __init__(self, session: DatabaseSession, logger): + """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # gets the gitlab api keys from the database via the GitlabApiKeyHandler + gitlab_api_keys = GitlabApiKeyHandler(session).keys + + if not gitlab_api_keys: + print("Failed to find github api keys. This is usually because your key has expired") + + header_name = "Authorization" + key_format = "Bearer {0}" + + super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py new file mode 100644 index 000000000..58a6e6437 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -0,0 +1,55 @@ +""" +Defines a GitLab-specific session and manifest object for use in GitLab tasks +""" +from logging import Logger + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.application.db.session import DatabaseSession + +class GitlabTaskManifest: + """ + Manifest object that represents the state and common elements of + the specified task. GitLab version for the GitLab tasks. 
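The `is_bad_api_key` check above leans on GitLab returning 401 Unauthorized from `/api/v4/user` for an invalid or expired token. A self-contained version of just that probe (a hypothetical helper, not Augur's class method) would be:

```py
# Standalone sketch of the token probe: 401 from /api/v4/user means the
# personal access token is invalid or expired.
import httpx

def is_bad_gitlab_key(token: str) -> bool:
    response = httpx.get(
        "https://gitlab.com/api/v4/user",
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    return response.status_code == 401
```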
+ + Attributes: + augur_db: sqlalchemy db object + key_auth: GitLab specific key auth retrieval collection + logger: logging object + platform_id: GitLab specific platform id (github is 1) + """ + + def __init__(self, logger): + + from augur.tasks.init.celery_app import engine + + self.augur_db = DatabaseSession(logger, engine) + self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) + self.logger = logger + self.platform_id = 2 + + def __enter__(self): + + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + + self.augur_db.close() + +class GitlabTaskSession(DatabaseSession): + """ORM session used in gitlab tasks. + This class adds the platform_id and the gitlab key authentication class, + to the already existing DatabaseSession so there is a central location to access + api keys and a single platform_id reference + + Attributes: + oauths (GitlabRandomKeyAuth): Class that handles randomly assigning gitlab api keys to httpx requests + platform_id (int): The id that refers to the Gitlab platform + """ + + def __init__(self, logger: Logger, engine=None): + + super().__init__(logger, engine=engine) + + self.oauths = GitlabRandomKeyAuth(self, logger) + self.platform_id = 2 + diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py new file mode 100644 index 000000000..cf6e5e5da --- /dev/null +++ b/augur/tasks/gitlab/issues_task.py @@ -0,0 +1,320 @@ +""" +Defines the set of tasks used to retrieve GitLab issue data. +""" +import logging +import traceback + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issues(repo_git : str) -> int: + """ + Retrieve and parse gitlab issues for the desired repo + + Arguments: + repo_git: the repo url string + """ + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + try: + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: + logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 + + + +def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: + """ + Retrieve only the needed data for issues from the api 
response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issues for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + issues = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = issues.get_num_pages(url) + for page_data, page in issues.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab Issues Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab Issues Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: + """ + Retrieve only the needed data for issues from the api response + + Arguments: + issues: List of dictionaries of issue data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + # get repo_id or have it passed + tool_source = "Gitlab Issue Task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_dicts = [] + issue_ids = [] + issue_mapping_data = {} + for issue in issues: + + issue_ids.append(issue["iid"]) + + issue_dicts.append( + extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) + ) + + issue_labels = extract_needed_gitlab_issue_label_data(issue["labels"], repo_id, + tool_source, tool_version, data_source) + + issue_assignees = extract_needed_gitlab_issue_assignee_data(issue["assignees"], repo_id, + tool_source, tool_version, data_source) + + mapping_data_key = issue["id"] + issue_mapping_data[mapping_data_key] = { + "labels": issue_labels, + "assignees": issue_assignees, + } + + + if len(issue_dicts) == 0: + print("No gitlab issues found while processing") + return + + logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") + issue_natural_keys = ["repo_id", "gh_issue_id"] + issue_string_columns = ["issue_title", "issue_body"] + issue_return_columns = ["gh_issue_id", "issue_id"] + + issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + + issue_label_dicts = [] + issue_assignee_dicts = [] + for data in issue_return_data: + + gh_issue_id = data["gh_issue_id"] + issue_id = data["issue_id"] + + try: + other_issue_data = issue_mapping_data[gh_issue_id] + except KeyError as e: + logger.info(f"{task_name}: Cold not find other gitlab issue data. This should never happen. Error: {e}") + + + # add the issue id to the lables and assignees, then add them to a list of dicts that will be inserted soon + dict_key = "issue_id" + issue_label_dicts += add_key_value_pair_to_dicts(other_issue_data["labels"], dict_key, issue_id) + issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], dict_key, issue_id) + + + logger.info(f"{task_name}: Inserting other gitlab issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + + # inserting issue labels + # we are using label_src_id and issue_id to determine if the label is already in the database. 
+ issue_label_natural_keys = ['label_src_id', 'issue_id'] + issue_label_string_fields = ["label_text", "label_description"] + augur_db.insert_data(issue_label_dicts, IssueLabel, + issue_label_natural_keys, string_fields=issue_label_string_fields) + + # inserting issue assignees + # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. + # issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + # augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + + return issue_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + issue_ids: Set of issue ids to collect coments for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + + if comments: + logger.info(f"Length of comments: {len(comments)}") + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue comments") + + +def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): + """ + Retrieve only the needed data for issue comments + + Arguments: + key_auth: key auth cache and rotator object + logger: loggin object + issue_ids: ids of issues to find comements for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + all_comments = {} + issue_count = len(issue_ids) + index = 1 + + comments = GitlabApiHandler(key_auth, logger) + + for id in issue_ids: + + logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + + for page_data, page in comments.iter_pages(url): + + if page_data is None or len(page_data) == 0: + break + + if id in all_comments: + all_comments[id].extend(page_data) + else: + all_comments[id] = page_data + + index += 1 + + return all_comments + + +def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for issue messages from the api response + + Arguments: + data: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + issue_number_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + issue_id = issue_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for issue number {id} in repo {repo_id}") + logger.info(f"{task_name}: 
Skipping") + continue + + for message in messages: + + issue_message_ref_data = extract_needed_gitlab_issue_message_ref_data(message, issue_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": issue_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + issue_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + issue_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") + issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py new file mode 100644 index 000000000..ccf3c7e01 --- /dev/null +++ b/augur/tasks/gitlab/merge_request_task.py @@ -0,0 +1,560 @@ +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_requests(repo_git: str) -> int: + """ + Retrieve and parse gitlab MRs for the desired repo + + Arguments: + repo_git: the repo url string + """ + + + logger = logging.getLogger(collect_gitlab_merge_requests.__name__) + + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + repo_id = augur_db.session.query(Repo).filter( + Repo.repo_git == repo_git).one().repo_id + + owner, repo = get_owner_repo(repo_git) + mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] + + +def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: + """ + 
Retrieve only the needed data for MRs from the api response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting pull requests for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + mrs = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = mrs.get_num_pages(url) + for page_data, page in mrs.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo} Mrs Page {page} contains no data...returning") + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + + all_data += page_data + + return all_data + + +def process_merge_requests(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: collection of mr data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + + Returns: + List of parsed MR ids. + """ + + tool_source = "Mr Task" + tool_version = "2.0" + data_source = "Gitlab API" + + merge_requests = [] + mr_ids = [] + mr_mapping_data = {} + for mr in data: + + mr_ids.append(mr["iid"]) + + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) + + assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) + + labels = extract_needed_mr_label_data(mr["labels"], repo_id, tool_source, tool_version, data_source) + + mapping_data_key = mr["id"] + mr_mapping_data[mapping_data_key] = { + "assignees": assignees, + "labels": labels + } + + logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") + pr_natural_keys = ["repo_id", "pr_src_id"] + pr_string_fields = ["pr_src_title", "pr_body"] + pr_return_columns = ["pull_request_id", "pr_src_id"] + pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + + + mr_assignee_dicts = [] + mr_label_dicts = [] + for data in pr_return_data: + + mr_src_id = data["pr_src_id"] + pull_request_id = data["pull_request_id"] + + try: + other_mr_data = mr_mapping_data[mr_src_id] + except KeyError as e: + logger.info(f"Cold not find other pr data. This should never happen. 
Error: {e}") + + dict_key = "pull_request_id" + mr_assignee_dicts += add_key_value_pair_to_dicts(other_mr_data["assignees"], dict_key, pull_request_id) + mr_label_dicts += add_key_value_pair_to_dicts(other_mr_data["labels"], dict_key, pull_request_id) + + logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") + + # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data + # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + + pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] + pr_label_string_fields = ["pr_src_description"] + augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + + return mr_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_comments(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: ids of MRs to paginate comments for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + + if comments: + logger.info(f"Length of merge request comments: {len(comments)}") + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request comments") + + +def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr message data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab mr comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + pull_request_id = mr_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for mr number {id} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + for message in messages: + + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": mr_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, 
data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + mr_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + mr_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") + mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] + augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_metadata(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: list of mr ids to find metadata for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_metadata.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + + if metadata_list: + logger.info(f"Length of merge request metadata: {len(metadata_list)}") + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request metadata") + +def process_mr_metadata(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr metadata + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Metadata Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_metadata = [] + for id, metadata in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") + pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] + augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_reviewers(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr reviewers for the desired repo + + 
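Several of the tasks above build their endpoint with a two-stage `str.format`: `owner` and `repo` are filled in immediately, while `id="{id}"` re-emits a literal `{id}` placeholder so the same template can be formatted once per merge request later. A small sketch of that trick with illustrative owner/repo values; `urllib.parse.quote` is one way to produce the URL-encoded project path that the tasks write inline as `%2f`.

```py
from urllib.parse import quote

owner, repo = "example-group", "example-project"       # illustrative values only
project = quote(f"{owner}/{repo}", safe="")             # "example-group%2Fexample-project"

url_template = "https://gitlab.com/api/v4/projects/{project}/merge_requests/{id}".format(
    project=project, id="{id}"                          # keeps "{id}" literal for later
)

for mr_iid in (1, 2, 3):
    print(url_template.format(id=mr_iid))               # one concrete URL per merge request
```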
Arguments: + mr_ids: mrs to search for reviewers for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_reviewers.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + + if reviewers: + logger.info(f"Length of merge request reviewers: {len(reviewers)}") + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") + +def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr Reviewer data from the api response + + Arguments: + data: List of dictionaries of mr Reviewer data + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Reviewer Task" + tool_version = "2.0" + data_source = "Gitlab API" + + logger.info(f"Running {task_name}...") + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_reviewers = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + reviewers = extract_needed_mr_reviewer_data(values, pull_request_id, tool_source, tool_version, data_source) + + all_reviewers += reviewers + + # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers + # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] + # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_commits(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr commits for the desired repo + + Arguments: + mr_ids: ids of mrs to get commits for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_commits.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + + if commits: + logger.info(f"Length of merge request commits: {len(commits)}") + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request commits") + + +def process_mr_commits(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr commits from the api response + + Arguments: + data: List of dictionaries of mr commit data + task_name: name of the task as well as the 
repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Commit Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_commits = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + for commit in values: + + all_commits.append(extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + + logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") + pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] + augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_files(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: the ids of mrs to get files for. + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_files.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + + if files: + logger.info(f"Length of merge request files: {len(files)}") + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request files") + +def process_mr_files(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr files Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_files = [] + for id, gitlab_file_data in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_files.extend(extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") + pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] + augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + + +def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): + """ + Retrieve specific mr data from the GitLab api. 
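Each `process_mr_*` helper above repeats the same preamble: query the repo's pull requests once, index them by `pr_src_number`, then stamp the resulting `pull_request_id` onto every child dict before a bulk insert. Below is a compact, hedged restatement of that pattern; the helper names are illustrative, and `attach_pull_request_id` only mirrors how `add_key_value_pair_to_dicts` appears to be used, not its actual implementation.

```py
def build_number_to_id_map(pull_requests):
    """Index ORM rows by their GitLab iid so child records can find their foreign key."""
    return {pr.pr_src_number: pr.pull_request_id for pr in pull_requests}

def attach_pull_request_id(child_dicts, pull_request_id):
    """Stamp the foreign key onto each staged dict and hand the list back."""
    for record in child_dicts:
        record["pull_request_id"] = pull_request_id
    return child_dicts

def link_children(data_by_iid, number_to_id_map, logger):
    """Usage sketch: skip iids the map does not know about instead of raising KeyError."""
    linked = []
    for iid, children in data_by_iid.items():
        pull_request_id = number_to_id_map.get(iid)
        if pull_request_id is None:
            logger.info(f"No stored merge request for iid {iid}; skipping")
            continue
        linked.extend(attach_pull_request_id(children, pull_request_id))
    return linked
```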
+ + Arguments: + ids: mr ids to paginate info for + url: endpoint to paginate or hit + name: name of data to collect + owner: owner of the repo + repo: repo name + key_auth: key auth cache and rotator object + logger: loggin object + response_type: type of data to get from the api + """ + + all_data = {} + mr_count = len(ids) + index = 1 + + api_handler = GitlabApiHandler(key_auth, logger) + for id in ids: + + print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {mr_count}") + formatted_url = url.format(id=id) + + if response_type == "dict": + page_data, _, _ = api_handler.retrieve_data(formatted_url) + if page_data: + all_data[id] = page_data + + elif response_type == "list": + + for page_data, _ in api_handler.iter_pages(formatted_url): + + if page_data is None or len(page_data) == 0: + break + + if id in all_data: + all_data[id].extend(page_data) + else: + all_data[id] = page_data + else: + raise Exception(f"Unexpected response type: {response_type}") + + index += 1 + + return all_data diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 706541d1c..274305449 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -20,16 +20,7 @@ from augur.application.db.engine import get_database_string from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import CollectionStatus, Repo - -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" - INITIALIZING = "Initializing" - UPDATE = "Update" - FAILED_CLONE = "Failed Clone" - +from augur.tasks.util.collection_state import CollectionState logger = logging.getLogger(__name__) @@ -50,6 +41,10 @@ class CollectionState(Enum): 'augur.tasks.github.pull_requests.commits_model.tasks', 'augur.tasks.github.traffic.tasks'] +gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', + 'augur.tasks.gitlab.issues_task', + 'augur.tasks.gitlab.events_task'] + git_tasks = ['augur.tasks.git.facade_tasks', 'augur.tasks.git.dependency_tasks.tasks', 'augur.tasks.git.dependency_libyear_tasks.tasks', @@ -66,7 +61,7 @@ class CollectionState(Enum): frontend_tasks = ['augur.tasks.frontend'] -tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + frontend_tasks +tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks @@ -81,7 +76,7 @@ class CollectionState(Enum): #Classes for tasks that take a repo_git as an argument. 
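`retrieve_merge_request_data` above treats two shapes of endpoint differently: single-MR metadata, approvals, and changes come back as one JSON object (`response_type == "dict"`), while notes and commits are paginated lists. The hedged, `requests`-based stand-in below shows the same dispatch without the key rotation or page iteration that `GitlabApiHandler` provides; the token header and page size are assumptions.

```py
import requests

def fetch_mr_endpoint(url, response_type, token=None, per_page=100):
    """Fetch one per-MR endpoint, returning either a dict or an accumulated list."""
    headers = {"PRIVATE-TOKEN": token} if token else {}

    if response_type == "dict":
        return requests.get(url, headers=headers, timeout=30).json()

    if response_type == "list":
        results, page = [], 1
        while True:
            resp = requests.get(
                url, headers=headers, timeout=30,
                params={"per_page": per_page, "page": page},
            )
            page_data = resp.json()
            if not page_data:            # an empty page means every page has been walked
                return results
            results.extend(page_data)
            page += 1

    raise ValueError(f"Unexpected response type: {response_type}")
```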
class AugurCoreRepoCollectionTask(celery.Task): - def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core'): + def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): from augur.tasks.init.celery_app import engine logger = AugurLogger(logger_name).get_logger() @@ -100,7 +95,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h prevStatus = getattr(repoStatus, f"{collection_hook}_status") if prevStatus == CollectionState.COLLECTING.value or prevStatus == CollectionState.INITIALIZING.value: - setattr(repoStatus, f"{collection_hook}_status", CollectionState.ERROR.value) + setattr(repoStatus, f"{collection_hook}_status", after_fail) setattr(repoStatus, f"{collection_hook}_task_id", None) session.commit() @@ -125,6 +120,7 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): repo_git = args[0] self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') + #task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask' celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks) @@ -205,7 +201,7 @@ def setup_periodic_tasks(sender, **kwargs): """ from celery.schedules import crontab from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights - from augur.tasks.start_tasks import non_repo_domain_tasks + from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model @@ -230,6 +226,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + logger.info(f"Setting 404 repos to be marked for retry on midnight each day") + sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) + logger.info(f"Scheduling contributor breadth every 30 days") thirty_days_in_seconds = 30*24*60*60 sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 54068d30a..a9ba7e163 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -24,15 +24,18 @@ from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments +from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession from logging import Logger -from enum import Enum from 
augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo +from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor @@ -93,6 +96,27 @@ def primary_repo_collect_phase(repo_git): return repo_task_group +def primary_repo_collect_phase_gitlab(repo_git): + + logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) + + jobs = group( + chain(collect_gitlab_merge_requests.si(repo_git), group( + #collect_merge_request_comments.s(repo_git), + #collect_merge_request_reviewers.s(repo_git), + collect_merge_request_metadata.s(repo_git), + collect_merge_request_commits.s(repo_git), + collect_merge_request_files.s(repo_git), + collect_gitlab_merge_request_events.si(repo_git), + )), + chain(collect_gitlab_issues.si(repo_git), group( + #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_events.si(repo_git), + )), + ) + + return jobs + #This phase creates the message for secondary collection tasks. #These are less important and have their own worker. @@ -102,8 +126,8 @@ def secondary_repo_collect_phase(repo_git): repo_task_group = group( process_pull_request_files.si(repo_git), process_pull_request_commits.si(repo_git), - process_ossf_dependency_metrics.si(repo_git), - chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)) + chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)), + process_ossf_dependency_metrics.si(repo_git) ) return repo_task_group @@ -146,20 +170,23 @@ def non_repo_domain_tasks(): def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] + primary_gitlab_enabled_phases = [] #Primary jobs if prelim_phase.__name__ in enabled_phase_names: primary_enabled_phases.append(prelim_phase) primary_enabled_phases.append(primary_repo_collect_phase) + primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. def core_task_success_util_gen(repo_git): return core_task_success_util.si(repo_git) primary_enabled_phases.append(core_task_success_util_gen) + primary_gitlab_enabled_phases.append(core_task_success_util_gen) - primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7) + primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) primary_request.get_valid_repos(session) return primary_request @@ -301,9 +328,41 @@ def augur_collection_update_weights(): session.commit() #git_update_commit_count_weight(repo_git) +@celery.task +def retry_errored_repos(): + """ + Periodic task to reset repositories that have errored and try again. + """ + from augur.tasks.init.celery_app import engine + logger = logging.getLogger(create_collection_status_records.__name__) + + #TODO: Isaac needs to normalize the status's to be abstract in the + #collection_status table once augur dev is less unstable. 
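`primary_repo_collect_phase_gitlab` above composes its work with Celery's canvas primitives: an immutable `.si()` signature kicks off collection, and the `group` chained after it fans out once the ids are known. The distinction that matters is that `.s()` children receive the preceding task's return value (the list of MR or issue ids) while `.si()` children ignore it. A minimal, self-contained sketch under assumed broker settings:

```py
# Sketch only: in-memory broker/backend and toy task bodies, not Augur's configuration.
from celery import Celery, chain, group

app = Celery("canvas_sketch", broker="memory://", backend="cache+memory://")

@app.task
def collect_ids(repo_git):
    return [101, 102, 103]                                 # stands in for collected MR iids

@app.task
def collect_comments(ids, repo_git):
    return f"{repo_git}: comments for {len(ids)} MRs"      # .s() => receives the id list

@app.task
def collect_events(repo_git):
    return f"{repo_git}: events"                           # .si() => ignores it

repo_git = "https://gitlab.com/example-group/example-project"
canvas = chain(
    collect_ids.si(repo_git),
    group(
        collect_comments.s(repo_git),
        collect_events.si(repo_git),
    ),
)
# canvas.apply_async() submits the whole graph; a worker runs the group in parallel.
```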
+ with DatabaseSession(logger,engine) as session: + query = s.sql.text(f"""UPDATE repo SET secondary_status = {CollectionState.PENDING.value}""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET core_status = {CollectionState.PENDING.value}""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET facade_status = {CollectionState.PENDING.value}""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET ml_status = {CollectionState.PENDING.value}""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' ;""" + ) + + session.execute_sql(query) + + + #Retry this task for every issue so that repos that were added manually get the chance to be added to the collection_status table. @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def create_collection_status_records(): + """ + Automatic task that runs and checks for repos that haven't been given a collection_status + record corresponding to the state of their collection at the monent. + + A special celery task that automatically retries itself and has no max retries. + """ + from augur.tasks.init.celery_app import engine logger = logging.getLogger(create_collection_status_records.__name__) diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py new file mode 100644 index 000000000..b5b8f0d26 --- /dev/null +++ b/augur/tasks/util/collection_state.py @@ -0,0 +1,30 @@ + +from enum import Enum + +class CollectionState(Enum): + """ + Enum of possible states a repository's collection + can have whether it is core, secondary, facade, etc. + + Attributes: + + SUCCESS: State of success for the jobs in that collection hook + PENDING: Means the repo has not had collection run at all + ERROR: The collection hook has crashed + COLLECTING: The collection hook is running + INITIALIZING: Only for facade, indicates the repo is being cloned via git + UPDATE: Only for facade, indicates the repo has been cloned + FAILED_CLONE: Only for facade, indicates the clone has failed (usually 404) + STANDBY: Indicates the repo has been paused + IGNORE: Repo has encountered an error and we will not try again (usually 404) + """ + + SUCCESS = "Success" + PENDING = "Pending" + ERROR = "Error" + COLLECTING = "Collecting" + INITIALIZING = "Initializing" + UPDATE = "Update" + FAILED_CLONE = "Failed Clone" + STANDBY = "Standby" + IGNORE = "Ignore" diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 4d5b663a2..89ae5f3d5 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -24,18 +24,9 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.tasks.util.collection_state import CollectionState -# class syntax -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" - INITIALIZING = "Initializing" - UPDATE = "Update" - FAILED_CLONE = "Failed Clone" - def get_list_of_all_users(session): #Get a list of all users. 
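`retry_errored_repos` above interpolates the `CollectionState` values straight into the SQL string, so every literal has to be quoted by hand. Below is a hedged, parameterized variant of the same nightly reset; it reuses the table and column names exactly as written in the task (which may not match the deployed schema) and expects to be handed an open SQLAlchemy connection.

```py
# Sketch only: binds the enum values instead of formatting them into the SQL string.
import sqlalchemy as s

from augur.tasks.util.collection_state import CollectionState

STATUS_COLUMNS = ("core_status", "secondary_status", "facade_status", "ml_status")

def reset_errored_repos(connection):
    """Flip every errored collection hook back to Pending so it is retried."""
    params = {
        "pending": CollectionState.PENDING.value,
        "error": CollectionState.ERROR.value,
    }
    for column in STATUS_COLUMNS:
        connection.execute(
            s.sql.text(f"UPDATE repo SET {column} = :pending WHERE {column} = :error"),
            params,
        )
```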
query = s.sql.text(""" @@ -132,9 +123,10 @@ def get_required_conditions_for_ml_repos(allow_collected_before = False, days_un class CollectionRequest: - def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1): + def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab_phases=None): self.name = name self.phases = phases + self.gitlab_phases = gitlab_phases self.max_repo = max_repo self.days_until_collect_again = days_until_collect_again self.new_status = CollectionState.PENDING.value @@ -587,27 +579,44 @@ def send_messages(self): for col_hook in self.collection_hooks: self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") - + for repo_git in col_hook.repo_list: - #repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() - #repo_id = repo.repo_id - - augur_collection_sequence = [] - for job in col_hook.phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name + repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + if "github" in repo.repo_git: + augur_collection_sequence = [] + for job in col_hook.phases: + #Add the phase to the sequence in order as a celery task. + #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + else: + if col_hook.gitlab_phases is not None: + + augur_collection_sequence = [] + for job in col_hook.gitlab_phases: + #Add the phase to the sequence in order as a celery task. 
+ #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name #def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): # diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 6380ed22b..84c177724 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -138,7 +138,7 @@ def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): try: required_output = json.loads(output) except json.decoder.JSONDecodeError as e: - session.logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") + logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") raise e return required_output diff --git a/augur/templates/login.j2 b/augur/templates/login.j2 index c71d02d50..faaab620e 100644 --- a/augur/templates/login.j2 +++ b/augur/templates/login.j2 @@ -108,7 +108,7 @@
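The `send_messages` change above now routes each repo by its clone URL: GitHub repos get the existing phase list, anything else falls through to the hook's `gitlab_phases` when one is configured, and both paths end in the same chain-and-`apply_async` handoff. A condensed, hedged restatement of that decision follows; `build_collection_chain` is an illustrative helper, not a method of the real class.

```py
from celery import chain

def build_collection_chain(repo_git, col_hook):
    """Pick the platform-appropriate phase list and link it into one celery chain."""
    if "github" in repo_git:
        phases = col_hook.phases
    elif col_hook.gitlab_phases is not None:
        phases = col_hook.gitlab_phases
    else:
        return None                                   # no phases defined for this platform

    # Each phase is a factory that returns a celery signature for this repo.
    return chain(*[phase(repo_git) for phase in phases])

# Usage sketch inside the send loop:
# canvas = build_collection_chain(repo_git, col_hook)
# if canvas is not None:
#     task_id = canvas.apply_async().task_id
```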