From 431f58e4e47e5bea6da5c5755033805399cd9b2e Mon Sep 17 00:00:00 2001
From: Andrew Brain
Date: Fri, 30 Aug 2024 06:54:13 -0500
Subject: [PATCH 01/21] add alembic script to add repo_src_id

---
 .../alembic/versions/30_add_repo_src_id.py    | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 augur/application/schema/alembic/versions/30_add_repo_src_id.py

diff --git a/augur/application/schema/alembic/versions/30_add_repo_src_id.py b/augur/application/schema/alembic/versions/30_add_repo_src_id.py
new file mode 100644
index 000000000..c43c409ee
--- /dev/null
+++ b/augur/application/schema/alembic/versions/30_add_repo_src_id.py
@@ -0,0 +1,23 @@
+"""Add commit message table
+
+Revision ID: 30
+Revises: 29
+Create Date: 2024-08-30
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = '30'
+down_revision = '29'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.add_column('repo', sa.Column('repo_src_id', sa.BigInteger(), nullable=True), schema='augur_data')
+
+
+def downgrade():
+    op.drop_column('repo', 'repo_src_id', schema='augur_data')

From 8214dc5bd74103b0cca0e142c24d8ffbf5f72341 Mon Sep 17 00:00:00 2001
From: Andrew Brain
Date: Mon, 2 Sep 2024 15:40:49 -0500
Subject: [PATCH 02/21] add task to populate repo src id

---
 .../tasks/github/util/populate_repo_src_id.py | 57 +++++++++++++++++++
 augur/tasks/init/celery_app.py                |  3 +-
 augur/tasks/start_tasks.py                    |  2 +
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 augur/tasks/github/util/populate_repo_src_id.py

diff --git a/augur/tasks/github/util/populate_repo_src_id.py b/augur/tasks/github/util/populate_repo_src_id.py
new file mode 100644
index 000000000..8c5c088d3
--- /dev/null
+++ b/augur/tasks/github/util/populate_repo_src_id.py
@@ -0,0 +1,57 @@
+import logging
+import sqlalchemy as s
+
+from augur.tasks.init.celery_app import celery_app as celery
+from augur.application.db.lib import get_repo_by_repo_git, execute_sql
+from augur.tasks.github.util.util import get_owner_repo
+from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess
+from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
+
+@celery.task
+def populate_repo_src_id_task(repo_git):
+
+    logger = logging.getLogger(populate_repo_src_id_task.__name__)
+
+    repo_obj = get_repo_by_repo_git(repo_git)
+    repo_id = repo_obj.repo_id
+
+    owner, repo = get_owner_repo(repo_git)
+
+    key_auth = GithubRandomKeyAuth(logger)
+
+    repo_src_id = get_repo_src_id(owner, repo, logger, key_auth)
+
+    update_repo_src_id(repo_id, repo_src_id)
+
+
+def get_repo_src_id(owner, repo, logger, key_auth):
+
+
+    query = """query($repo: String!, $owner: String!) {
+        repository(name: $repo, owner: $owner) {
+            updatedAt
+        }
+    }
+    """
+
+    github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger)
+
+    variables = {
+        "owner": owner,
+        "repo": repo
+    }
+
+    result_keys = ["repository", "databaseId"]
+
+    repo_src_id = github_graphql_data_access.get_resource(query, variables, result_keys)
+
+    return repo_src_id
+
+
+
+def update_repo_src_id(repo_id, repo_src_id):
+
+    query = s.sql.text("""UPDATE repo SET repo_src_id=:repo_src_id WHERE repo_id=:repo_id;
+    """).bindparams(repo_src_id=repo_src_id, repo_id=repo_id)
+
+    execute_sql(query)
diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py
index da97751db..562ddcb90 100644
--- a/augur/tasks/init/celery_app.py
+++ b/augur/tasks/init/celery_app.py
@@ -37,7 +37,8 @@
                 'augur.tasks.github.detect_move.tasks',
                 'augur.tasks.github.pull_requests.files_model.tasks',
                 'augur.tasks.github.pull_requests.commits_model.tasks',
-                'augur.tasks.github.traffic']
+                'augur.tasks.github.traffic',
+                'augur.tasks.github.util.populate_repo_src_id']
 
 gitlab_tasks = ['augur.tasks.gitlab.merge_request_task',
                     'augur.tasks.gitlab.issues_task',
diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py
index 562069ce8..785470ce8 100644
--- a/augur/tasks/start_tasks.py
+++ b/augur/tasks/start_tasks.py
@@ -14,6 +14,7 @@
 from augur.tasks.github.repo_info.tasks import collect_repo_info, collect_linux_badge_info
 from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files
 from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits
+from augur.tasks.github.util.populate_repo_src_id import populate_repo_src_id_task
 from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics
 from augur.tasks.github.traffic import collect_github_repo_clones_data
 from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments
@@ -65,6 +66,7 @@ def primary_repo_collect_phase(repo_git, full_collection):
 
     #Define primary group of jobs for the primary collect phase: issues and pull requests.
     primary_repo_jobs = group(
+        populate_repo_src_id_task.si(repo_git)
         collect_issues.si(repo_git, full_collection),
         collect_pull_requests.si(repo_git, full_collection)
     )

From 9ad160c087d008307b0186f60ac375bd53f19065 Mon Sep 17 00:00:00 2001
From: Andrew Brain
Date: Sat, 7 Sep 2024 11:31:11 -0500
Subject: [PATCH 03/21] move get_repo_src_id to generic location

---
 .../tasks/github/util/populate_repo_src_id.py | 33 ++-----------------
 augur/tasks/github/util/util.py               | 27 +++++++++++++++
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/augur/tasks/github/util/populate_repo_src_id.py b/augur/tasks/github/util/populate_repo_src_id.py
index 8c5c088d3..e5460f9b1 100644
--- a/augur/tasks/github/util/populate_repo_src_id.py
+++ b/augur/tasks/github/util/populate_repo_src_id.py
@@ -4,8 +4,8 @@
 from augur.tasks.init.celery_app import celery_app as celery
 from augur.application.db.lib import get_repo_by_repo_git, execute_sql
 from augur.tasks.github.util.util import get_owner_repo
-from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess
-from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
+from augur.tasks.github.util import get_repo_src_id
+
 
 @celery.task
 def populate_repo_src_id_task(repo_git):
@@ -17,38 +17,11 @@ def populate_repo_src_id_task(repo_git):
 
     owner, repo = get_owner_repo(repo_git)
 
-    key_auth = GithubRandomKeyAuth(logger)
-
-    repo_src_id = get_repo_src_id(owner, repo, logger, key_auth)
+    repo_src_id = get_repo_src_id(owner, repo, logger)
 
     update_repo_src_id(repo_id, repo_src_id)
 
 
-def get_repo_src_id(owner, repo, logger, key_auth):
-
-
-    query = """query($repo: String!, $owner: String!) {
-        repository(name: $repo, owner: $owner) {
-            updatedAt
-        }
-    }
-    """
-
-    github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger)
-
-    variables = {
-        "owner": owner,
-        "repo": repo
-    }
-
-    result_keys = ["repository", "databaseId"]
-
-    repo_src_id = github_graphql_data_access.get_resource(query, variables, result_keys)
-
-    return repo_src_id
-
-
-
 def update_repo_src_id(repo_id, repo_src_id):
 
     query = s.sql.text("""UPDATE repo SET repo_src_id=:repo_src_id WHERE repo_id=:repo_id;
diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py
index 8dd6e4d81..feb3c0e37 100644
--- a/augur/tasks/github/util/util.py
+++ b/augur/tasks/github/util/util.py
@@ -4,9 +4,36 @@
 import json
 import httpx
 from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
+from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess
 from augur.application.db.lib import get_repo_by_repo_git
 from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps
 
+def get_repo_src_id(owner, repo, logger):
+
+
+    query = """query($repo: String!, $owner: String!) {
+        repository(name: $repo, owner: $owner) {
+            databaseId
+        }
+    }
+    """
+
+    key_auth = GithubRandomKeyAuth(logger)
+
+    github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger)
+
+    variables = {
+        "owner": owner,
+        "repo": repo
+    }
+
+    result_keys = ["repository", "databaseId"]
+
+    repo_src_id = github_graphql_data_access.get_resource(query, variables, result_keys)
+
+    return repo_src_id
+
+
 
 # This function adds a key value pair to a list of dicts and returns the modified list of dicts back
 def add_key_value_pair_to_dicts(data: List[dict], key: str, value: Any) -> List[dict]:

From db7e9efdce33c16813b05d9428a4a797b72c8b4a Mon Sep 17 00:00:00 2001
From: Andrew Brain
Date: Sat, 7 Sep 2024 11:37:11 -0500
Subject: [PATCH 04/21] fix syntax issues

---
 augur/tasks/github/util/populate_repo_src_id.py | 4 +---
 augur/tasks/start_tasks.py                      | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/augur/tasks/github/util/populate_repo_src_id.py b/augur/tasks/github/util/populate_repo_src_id.py
index e5460f9b1..e4aad54ab 100644
--- a/augur/tasks/github/util/populate_repo_src_id.py
+++ b/augur/tasks/github/util/populate_repo_src_id.py
@@ -3,9 +3,7 @@
 
 from augur.tasks.init.celery_app import celery_app as celery
 from augur.application.db.lib import get_repo_by_repo_git, execute_sql
-from augur.tasks.github.util.util import get_owner_repo
-from augur.tasks.github.util import get_repo_src_id
-
+from augur.tasks.github.util.util import get_owner_repo, get_repo_src_id
 
 @celery.task
 def populate_repo_src_id_task(repo_git):
diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py
index 785470ce8..ab4cf217c 100644
--- a/augur/tasks/start_tasks.py
+++ b/augur/tasks/start_tasks.py
@@ -66,7 +66,7 @@ def primary_repo_collect_phase(repo_git, full_collection):
 
     #Define primary group of jobs for the primary collect phase: issues and pull requests.
primary_repo_jobs = group( - populate_repo_src_id_task.si(repo_git) + populate_repo_src_id_task.si(repo_git), collect_issues.si(repo_git, full_collection), collect_pull_requests.si(repo_git, full_collection) ) From 7c6ea1195f6d8a739fb0727213cff99a30ceccc0 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 9 Sep 2024 18:31:04 -0500 Subject: [PATCH 05/21] first run at new algorithm --- augur/application/db/lib.py | 32 +++- augur/application/db/models/augur_data.py | 1 + augur/tasks/frontend.py | 154 ++++++++++++++++-- .../github/util/github_graphql_data_access.py | 27 +-- 4 files changed, 188 insertions(+), 26 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 8db495c76..e65f4665d 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -9,7 +9,7 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup from augur.tasks.util.collection_state import CollectionState from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query @@ -144,6 +144,15 @@ def get_repo_by_repo_id(repo_id): return repo +def get_repo_by_src_id(src_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_src_id == src_id) + repo = execute_session_query(query, 'first') + + return repo + def remove_working_commits_by_repo_id_and_hashes(repo_id, commit_hashes): remove_working_commits = s.sql.text("""DELETE FROM working_commits @@ -537,3 +546,24 @@ def get_updated_issues(repo_id, since): with get_session() as session: return session.query(Issue).filter(Issue.repo_id == repo_id, Issue.updated_at >= since).order_by(Issue.gh_issue_number).all() + + +def get_group_by_name(user_id, group_name): + + + with get_session() as session: + + try: + user_group = session.query(UserGroup).filter(UserGroup.user_id == user_id, UserGroup.name == group_name).one() + except s.orm.exc.NoResultFound: + return None + + return user_group.group_id + +def get_repo_group_by_name(name): + + + with get_session() as session: + + return session.query(RepoGroup).filter(RepoGroup.rg_name == name).first() + \ No newline at end of file diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 934949138..afd4c685c 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -869,6 +869,7 @@ class Repo(Base): data_collection_date = Column( TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) + repo_src_id = Column(BigInteger) repo_group = relationship("RepoGroup", back_populates="repo") user_repo = relationship("UserRepo", back_populates="repo") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index fffd79d33..669d8c1cd 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -1,8 +1,14 @@ import logging import re +import sqlalchemy as s from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.github.util.github_task_session import GithubTaskSession +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from 
augur.application.db.lib import get_group_by_name, get_repo_group_by_name, get_repo_by_repo_git, get_repo_by_src_id +from augur.tasks.github.util.util import get_owner_repo +from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME + from augur.application.db.models import UserRepo, Repo, User def parse_org_name(string): @@ -15,6 +21,91 @@ def parse_org_and_repo_name(string): match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) return match +@celery.task +def add_orgs_and_repos(user_id, group_name, org_urls, repo_urls): + + logger = logging.getLogger(add_org_repo_list.__name__) + + with GithubTaskSession(logger) as session: + + user = User.get_by_id(session, user_id) + + group = get_group_by_name(session, user_id, group_name) + if not group: + return False, {"status": "Invalid group name"} + + group_id = group.group_id + + for url in org_urls: + org_repos, _ = retrieve_owner_repos(url) + if not org_repos: + continue + + repo_urls.extend(org_repos) + + + data = get_repos_data(repo_urls, session, logger) + + for url in repo_urls: + + repo_data = data[url] + if not repo_data: + # skip since the repo doesn't exists + continue + + repo_type = repo_data["databaseId"] + repo_src_id = repo_data["owner"]["__typename"] + + try: + repo = get_repo_by_repo_git(url) + except s.orm.exc.NoResultFound: + # log a warning + continue + + repo = get_repo_by_src_id(repo_src_id) + if repo: + # log a warning + continue + + frontend_repo_group = get_repo_group_by_name(FRONTEND_REPO_GROUP_NAME) + if not frontend_repo_group: + return False, {"status": "Could not find repo group with name 'Frontend Repos'", "repo_url": url} + + repo_group_id = frontend_repo_group.repo_group_id + + + # These two things really need to be done in one commit + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) + if not repo_id: + # log a warning + continue + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + # log a warning + continue + + + # repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) + # if not repo_id: + # return False, {"status": "Repo insertion failed", "repo_url": url} + + # result = UserRepo.insert(session, repo_id, group_id) + # if not result: + # return False, {"status": "repo_user insertion failed", "repo_url": url} + + #collection_status records are now only added during collection -IM 5/1/23 + #status = CollectionStatus.insert(session, repo_id) + #if not status: + # return False, {"status": "Failed to create status for repo", "repo_url": url} + + return True, {"status": "Repo Added", "repo_url": url} + +@celery.task +def add_org(): + + pass + @celery.task def add_org_repo_list(user_id, group_name, urls): @@ -75,24 +166,61 @@ def add_org_repo_list(user_id, group_name, urls): # TODO: Change to github specific -@celery.task -def add_repo(user_id, group_name, repo_url): +# @celery.task +# def add_repo(user_id, group_name, repo_url): - logger = logging.getLogger(add_org.__name__) +# logger = logging.getLogger(add_org.__name__) - with GithubTaskSession(logger) as session: - result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) +# with GithubTaskSession(logger) as session: +# result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) - print(repo_url, result) +# print(repo_url, result) -# TODO: Change to github specific -@celery.task -def add_org(user_id, group_name, org_url): +# # TODO: Change to github specific +# @celery.task +# def 
add_org(user_id, group_name, org_url): - logger = logging.getLogger(add_org.__name__) +# logger = logging.getLogger(add_org.__name__) - with GithubTaskSession(logger) as session: - result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) +# with GithubTaskSession(logger) as session: +# result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) + +# print(org_url, result) + + + + + + + + + + + +def get_repos_data(repo_urls, session, logger): + + github_graphql_data_access = GithubGraphQlDataAccess(session.oauths, logger, ingore_not_found_error=True) + + query_parts = [] + repo_map = {} + for i, url in enumerate(repo_urls): + owner, repo = get_owner_repo(url) + query_parts.append(f"""{i}: repository(owner: "{owner}", name: "{repo}") {{ + databaseId, owner {{ __typename }} + }}""") + repo_map[url] = i + + query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" + + data = github_graphql_data_access.get_resource(query, {}, []) + + result_data = {} + for url in repo_urls: + key =repo_map[url] + repo_data = data[key] + + result_data[url] = repo_data + + return result_data - print(org_url, result) diff --git a/augur/tasks/github/util/github_graphql_data_access.py b/augur/tasks/github/util/github_graphql_data_access.py index 7d8c6851e..96b0c6ab7 100644 --- a/augur/tasks/github/util/github_graphql_data_access.py +++ b/augur/tasks/github/util/github_graphql_data_access.py @@ -22,10 +22,11 @@ class InvalidDataException(Exception): class GithubGraphQlDataAccess: - def __init__(self, key_manager, logger: logging.Logger): + def __init__(self, key_manager, logger: logging.Logger, ingore_not_found_error=False): self.logger = logger self.key_manager = key_manager + self.ingore_not_found_error = ingore_not_found_error def get_resource(self, query, variables, result_keys): @@ -77,17 +78,19 @@ def make_request(self, query, variables, timeout=40): response.raise_for_status() - json_response = response.json() - if "errors" in json_response and len(json_response["errors"]) > 0: - errors = json_response["errors"] - - not_found_error = self.__find_first_error_of_type(errors, "NOT_FOUND") - - if not_found_error: - message = not_found_error.get("message", "Resource not found.") - raise NotFoundException(f"Could not find: {message}") - - raise Exception(f"Github Graphql Data Access Errors: {errors}") + if not self.ingore_not_found_error: + + json_response = response.json() + if "errors" in json_response and len(json_response["errors"]) > 0: + errors = json_response["errors"] + + not_found_error = self.__find_first_error_of_type(errors, "NOT_FOUND") + + if not_found_error: + message = not_found_error.get("message", "Resource not found.") + raise NotFoundException(f"Could not find: {message}") + + raise Exception(f"Github Graphql Data Access Errors: {errors}") return response From 652d7f394da11d39dd3fb1a35798b62f02360348 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 9 Sep 2024 19:22:37 -0500 Subject: [PATCH 06/21] insert src_id and make it unique --- augur/application/db/models/augur_data.py | 5 +++-- .../schema/alembic/versions/30_add_repo_src_id.py | 2 ++ augur/tasks/frontend.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index afd4c685c..ddc9a14c4 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -1111,7 +1111,7 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): return 
result[0]["repo_id"] @staticmethod - def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type): + def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type, repo_src_id): """Add a repo to the repo table. Args: @@ -1146,7 +1146,8 @@ def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_ "repo_type": repo_type, "tool_source": tool_source, "tool_version": "1.0", - "data_source": "Git" + "data_source": "Git", + "repo_src_id": repo_src_id } repo_unique = ["repo_git"] diff --git a/augur/application/schema/alembic/versions/30_add_repo_src_id.py b/augur/application/schema/alembic/versions/30_add_repo_src_id.py index c43c409ee..d47eb568c 100644 --- a/augur/application/schema/alembic/versions/30_add_repo_src_id.py +++ b/augur/application/schema/alembic/versions/30_add_repo_src_id.py @@ -17,7 +17,9 @@ def upgrade(): op.add_column('repo', sa.Column('repo_src_id', sa.BigInteger(), nullable=True), schema='augur_data') + op.create_unique_constraint('repo_src_id_unique', 'repo', ['repo_src_id'], schema='augur_data') def downgrade(): + op.drop_constraint('repo_src_id_unique', 'repo', schema='augur_data', type_='unique') op.drop_column('repo', 'repo_src_id', schema='augur_data') diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index 669d8c1cd..c3bc80dd0 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -75,7 +75,7 @@ def add_orgs_and_repos(user_id, group_name, org_urls, repo_urls): # These two things really need to be done in one commit - repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type, repo_src_id) if not repo_id: # log a warning continue From 3d89a4f870385ab7d12055fec195020b91f5d0ea Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 10 Sep 2024 18:05:29 -0500 Subject: [PATCH 07/21] improve algorithm --- augur/api/view/api.py | 23 +++- augur/tasks/frontend.py | 272 ++++++++++++++++++++++------------------ 2 files changed, 171 insertions(+), 124 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index cbd7e4a0f..a9653b9f9 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -2,7 +2,7 @@ import re from flask_login import current_user, login_required from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo -from augur.tasks.frontend import add_org_repo_list, parse_org_and_repo_name, parse_org_name +from augur.tasks.frontend import add_orgs_and_repos, parse_org_and_repo_name, parse_org_name from .utils import * from ..server import app from augur.application.db.session import DatabaseSession @@ -68,11 +68,16 @@ def av_add_user_repo(): invalid_urls = [] + orgs = [] + repo_urls = [] with DatabaseSession(logger, current_app.engine) as session: for url in urls: # matches https://github.com/{org}/ or htts://github.com/{org} if (org_name := Repo.parse_github_org_url(url)): + + orgs.append(org_name) + rg_obj = RepoGroup.get_by_name(session, org_name) if rg_obj: # add the orgs repos to the group @@ -80,6 +85,9 @@ def av_add_user_repo(): # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: + + repo_urls.append(url) + org_name, repo_name = Repo.parse_github_repo_url(url) repo_git = f"https://github.com/{org_name}/{repo_name}" repo_obj = Repo.get_by_repo_git(session, repo_git) @@ -90,6 +98,9 @@ def av_add_user_repo(): elif (match := parse_org_and_repo_name(url)): 
org, repo = match.groups() repo_git = f"https://github.com/{org}/{repo}" + + repo_urls.append(repo_git) + repo_obj = Repo.get_by_repo_git(session, repo_git) if repo_obj: add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) @@ -97,6 +108,9 @@ def av_add_user_repo(): # matches /{org}/ or /{org} or {org}/ or {org} elif (match := parse_org_name(url)): org_name = match.group(1) + + orgs.append(org) + rg_obj = RepoGroup.get_by_name(session, org_name) logger.info(rg_obj) if rg_obj: @@ -117,9 +131,10 @@ def av_add_user_repo(): else: invalid_urls.append(url) - if urls: - urls = [url.lower() for url in urls] - add_org_repo_list.si(current_user.user_id, group, urls).apply_async() + if orgs or repo_urls: + repo_urls = [url.lower() for url in repo_urls] + orgs = [url.lower() for url in orgs] + add_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() flash("Adding repos and orgs in the background") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index c3bc80dd0..e726468be 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -7,7 +7,7 @@ from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from augur.application.db.lib import get_group_by_name, get_repo_group_by_name, get_repo_by_repo_git, get_repo_by_src_id from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME +from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup from augur.application.db.models import UserRepo, Repo, User @@ -22,144 +22,201 @@ def parse_org_and_repo_name(string): return match @celery.task -def add_orgs_and_repos(user_id, group_name, org_urls, repo_urls): +def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): - logger = logging.getLogger(add_org_repo_list.__name__) + logger = logging.getLogger(add_orgs_and_repos.__name__) with GithubTaskSession(logger) as session: - user = User.get_by_id(session, user_id) - + # determine group id from name group = get_group_by_name(session, user_id, group_name) if not group: - return False, {"status": "Invalid group name"} + logger.error(f"Error while adding repo. Invalid group name of {group_name}. 
Cannot insert repos") + return group_id = group.group_id - for url in org_urls: + # get frontend repo group + frontend_repo_group = RepoGroup.get_by_name(FRONTEND_REPO_GROUP_NAME) + if not frontend_repo_group: + logger.error("Error while adding repo: Could not find frontend repo group so repos cannot be inserted") + return + + repo_group_id = frontend_repo_group.repo_group_id + + + # define repo_data and assoicate repos with frontend repo group + repo_data = [tuple(url, repo_group_id) for url in repo_urls] + + for org in orgs: + + # create repo group for org if it doesn't exist + repo_group = RepoGroup.get_by_name(org) + if not repo_group: + repo_group = create_repo_group(session, org) + + # retrieve repo urls for org org_repos, _ = retrieve_owner_repos(url) if not org_repos: continue - repo_urls.extend(org_repos) + # define urls and repo_group_id of org and then add to repo_data + org_repo_data = [tuple(url, repo_group.repo_group_id) for url in org_repos] + repo_data.extend(org_repo_data) - data = get_repos_data(repo_urls, session, logger) + # get data for repos to determine type, src id, and if they exist + data = get_repos_data(repo_data, session, logger) - for url in repo_urls: + for url, repo_group_id in repo_data: repo_data = data[url] if not repo_data: - # skip since the repo doesn't exists + # skip since cause the repo is not valid (doesn't exist likely) continue - repo_type = repo_data["databaseId"] - repo_src_id = repo_data["owner"]["__typename"] + repo_src_id = repo_data["databaseId"] + repo_type = repo_data["owner"]["__typename"] - try: - repo = get_repo_by_repo_git(url) - except s.orm.exc.NoResultFound: - # log a warning + repo = get_repo_by_repo_git(session, url) + if repo: + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) + logger.warning(f"Error while adding repo: Repo already exists with {url}") continue repo = get_repo_by_src_id(repo_src_id) if repo: - # log a warning + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) + logger.warning(f"Error while adding repo: Repo found with same src id. Inserting url: {url}. 
Inserting src_id {repo_src_id}") continue - frontend_repo_group = get_repo_group_by_name(FRONTEND_REPO_GROUP_NAME) - if not frontend_repo_group: - return False, {"status": "Could not find repo group with name 'Frontend Repos'", "repo_url": url} + add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) - repo_group_id = frontend_repo_group.repo_group_id + return - # These two things really need to be done in one commit - repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type, repo_src_id) - if not repo_id: - # log a warning - continue +def get_repos_data(repo_data, session, logger): - result = UserRepo.insert(session, repo_id, group_id) - if not result: - # log a warning - continue + repo_urls = [x[0] for x in repo_data] + + github_graphql_data_access = GithubGraphQlDataAccess(session.oauths, logger, ingore_not_found_error=True) + + query_parts = [] + repo_map = {} + for i, url in enumerate(repo_urls): + owner, repo = get_owner_repo(url) + query_parts.append(f"""{i}: repository(owner: "{owner}", name: "{repo}") {{ + databaseId, owner {{ __typename }} + }}""") + repo_map[url] = i + + query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" + data = github_graphql_data_access.get_resource(query, {}, []) - # repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) - # if not repo_id: - # return False, {"status": "Repo insertion failed", "repo_url": url} + result_data = {} + for url in repo_urls: + key =repo_map[url] + repo_data = data[key] - # result = UserRepo.insert(session, repo_id, group_id) - # if not result: - # return False, {"status": "repo_user insertion failed", "repo_url": url} + result_data[url] = repo_data + + return result_data - #collection_status records are now only added during collection -IM 5/1/23 - #status = CollectionStatus.insert(session, repo_id) - #if not status: - # return False, {"status": "Failed to create status for repo", "repo_url": url} +def get_repo_by_repo_git(session, url): - return True, {"status": "Repo Added", "repo_url": url} + return session.query(Repo).filter(Repo.repo_git == url).first() -@celery.task -def add_org(): - pass +def add_existing_repo_to_group(logger, session, user_id, group_name, repo_id): + logger.info("Adding existing repo to group") -@celery.task -def add_org_repo_list(user_id, group_name, urls): + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False + +def create_repo_group(session, owner): - logger = logging.getLogger(add_org_repo_list.__name__) + repo_group = RepoGroup(rg_name=owner.lower(), rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", + tool_source="Loaded by user", tool_version="1.0", data_source="Git") + session.add(repo_group) + session.commit() - with GithubTaskSession(logger) as session: + return repo_group + +def add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id): + + # These two things really need to be done in one commit in the future to prevent one existing without the other + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type, repo_src_id) + if not repo_id: + logger.error("Error while adding repo: Failed to insert github repo") + return + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + logger.error(f"Error while adding repo: Failed to insert user repo record. 
A record with a repo_id of {repo_id} and a group id of {group_id} needs to be added to the user repo table so that this repo shows up in the users group") + return + +# @celery.task +# def add_org_repo_list(user_id, group_name, urls): + +# logger = logging.getLogger(add_org_repo_list.__name__) + +# with GithubTaskSession(logger) as session: - user = User.get_by_id(session, user_id) - - invalid_urls = [] - valid_orgs = [] - valid_repos = [] - for url in urls: - - # matches https://github.com/{org}/ or http://github.com/{org} - if Repo.parse_github_org_url(url): - added = user.add_github_org(group_name, url)[0] - if added: - valid_orgs.append(url) - - # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: - added = user.add_github_repo(group_name, url)[0] - if added: - valid_repos.append(url) - - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := parse_org_and_repo_name(url)): - org, repo = match.groups() - repo_url = f"https://github.com/{org}/{repo}/" - added = user.add_github_repo(group_name, repo_url)[0] - if added: - valid_repos.append(url) - - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := parse_org_name(url)): - org = match.group(1) - org_url = f"https://github.com/{org}/" - added = user.add_github_org(group_name, org_url)[0] - if added: - valid_orgs.append(url) - - # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} - elif Repo.parse_gitlab_repo_url(url)[0]: - - added = user.add_gitlab_repo(group_name, url)[0] - if added: - valid_repos.append(url) - - else: - invalid_urls.append(url) - - return valid_orgs, valid_repos, invalid_urls +# user = User.get_by_id(session, user_id) + +# invalid_urls = [] +# valid_orgs = [] +# valid_repos = [] +# for url in urls: + +# # matches https://github.com/{org}/ or http://github.com/{org} +# if Repo.parse_github_org_url(url): +# added = user.add_github_org(group_name, url)[0] +# if added: +# valid_orgs.append(url) + +# # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} +# elif Repo.parse_github_repo_url(url)[0]: +# added = user.add_github_repo(group_name, url)[0] +# if added: +# valid_repos.append(url) + +# # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} +# elif (match := parse_org_and_repo_name(url)): +# org, repo = match.groups() +# repo_url = f"https://github.com/{org}/{repo}/" +# added = user.add_github_repo(group_name, repo_url)[0] +# if added: +# valid_repos.append(url) + +# # matches /{org}/ or /{org} or {org}/ or {org} +# elif (match := parse_org_name(url)): +# org = match.group(1) +# org_url = f"https://github.com/{org}/" +# added = user.add_github_org(group_name, org_url)[0] +# if added: +# valid_orgs.append(url) + +# # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} +# elif Repo.parse_gitlab_repo_url(url)[0]: + +# added = user.add_gitlab_repo(group_name, url)[0] +# if added: +# valid_repos.append(url) + +# else: +# invalid_urls.append(url) + +# return valid_orgs, valid_repos, invalid_urls @@ -198,29 +255,4 @@ def add_org_repo_list(user_id, group_name, urls): -def get_repos_data(repo_urls, session, logger): - - github_graphql_data_access = GithubGraphQlDataAccess(session.oauths, logger, ingore_not_found_error=True) - - query_parts = [] - repo_map = {} - for i, url in enumerate(repo_urls): - owner, repo = get_owner_repo(url) - query_parts.append(f"""{i}: repository(owner: "{owner}", name: "{repo}") {{ - databaseId, 
owner {{ __typename }} - }}""") - repo_map[url] = i - - query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" - - data = github_graphql_data_access.get_resource(query, {}, []) - - result_data = {} - for url in repo_urls: - key =repo_map[url] - repo_data = data[key] - - result_data[url] = repo_data - - return result_data From 02d1a177dedc488610d6d6dae99a0b8d2e41f835 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 10 Sep 2024 18:11:00 -0500 Subject: [PATCH 08/21] remove old code --- augur/api/view/api.py | 40 ++++++++++++++++++++-------------------- augur/tasks/frontend.py | 1 + 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index a9653b9f9..bb18fbf4a 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -78,21 +78,21 @@ def av_add_user_repo(): orgs.append(org_name) - rg_obj = RepoGroup.get_by_name(session, org_name) - if rg_obj: - # add the orgs repos to the group - add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + # rg_obj = RepoGroup.get_by_name(session, org_name) + # if rg_obj: + # # add the orgs repos to the group + # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: repo_urls.append(url) - org_name, repo_name = Repo.parse_github_repo_url(url) - repo_git = f"https://github.com/{org_name}/{repo_name}" - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + # org_name, repo_name = Repo.parse_github_repo_url(url) + # repo_git = f"https://github.com/{org_name}/{repo_name}" + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} elif (match := parse_org_and_repo_name(url)): @@ -101,9 +101,9 @@ def av_add_user_repo(): repo_urls.append(repo_git) - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) # matches /{org}/ or /{org} or {org}/ or {org} elif (match := parse_org_name(url)): @@ -111,11 +111,11 @@ def av_add_user_repo(): orgs.append(org) - rg_obj = RepoGroup.get_by_name(session, org_name) - logger.info(rg_obj) - if rg_obj: - # add the orgs repos to the group - add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + # rg_obj = RepoGroup.get_by_name(session, org_name) + # logger.info(rg_obj) + # if rg_obj: + # # add the orgs repos to the group + # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} elif Repo.parse_gitlab_repo_url(url)[0]: @@ -124,9 +124,9 @@ def av_add_user_repo(): repo_git = f"https://gitlab.com/{org_name}/{repo_name}" # TODO: gitlab ensure the whole repo git is inserted so it can be found here - repo_obj = Repo.get_by_repo_git(session, repo_git) - if repo_obj: - add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # 
add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) else: invalid_urls.append(url) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index e726468be..e20d539c2 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -21,6 +21,7 @@ def parse_org_and_repo_name(string): match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) return match +# TODO: Add support for gitlab @celery.task def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): From 35505e62fce35a689a1e86f399529c51ce8323e7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 10 Sep 2024 19:27:24 -0500 Subject: [PATCH 09/21] fix syntax errors --- augur/api/view/api.py | 97 ++++++++++--------- augur/application/db/lib.py | 2 +- .../alembic/versions/30_add_repo_src_id.py | 2 +- augur/tasks/frontend.py | 22 ++--- 4 files changed, 64 insertions(+), 59 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index bb18fbf4a..9f5c4a235 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -48,6 +48,8 @@ def add_existing_org_to_group(session, user_id, group_name, rg_id): @login_required def av_add_user_repo(): + print("Adding user repos") + urls = request.form.get('urls') group = request.form.get("group_name") @@ -70,70 +72,73 @@ def av_add_user_repo(): orgs = [] repo_urls = [] - with DatabaseSession(logger, current_app.engine) as session: - for url in urls: + for url in urls: - # matches https://github.com/{org}/ or htts://github.com/{org} - if (org_name := Repo.parse_github_org_url(url)): + # matches https://github.com/{org}/ or htts://github.com/{org} + if (org_name := Repo.parse_github_org_url(url)): - orgs.append(org_name) + orgs.append(org_name) - # rg_obj = RepoGroup.get_by_name(session, org_name) - # if rg_obj: - # # add the orgs repos to the group - # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + # rg_obj = RepoGroup.get_by_name(session, org_name) + # if rg_obj: + # # add the orgs repos to the group + # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: - repo_urls.append(url) + repo_urls.append(url) - # org_name, repo_name = Repo.parse_github_repo_url(url) - # repo_git = f"https://github.com/{org_name}/{repo_name}" - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + # org_name, repo_name = Repo.parse_github_repo_url(url) + # repo_git = f"https://github.com/{org_name}/{repo_name}" + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := parse_org_and_repo_name(url)): - org, repo = match.groups() - repo_git = f"https://github.com/{org}/{repo}" + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := parse_org_and_repo_name(url)): + org, repo = match.groups() + repo_git = f"https://github.com/{org}/{repo}" - repo_urls.append(repo_git) + repo_urls.append(repo_git) - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, 
current_user.user_id, group, repo_obj.repo_id) - - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := parse_org_name(url)): - org_name = match.group(1) + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := parse_org_name(url)): + org_name = match.group(1) - orgs.append(org) + orgs.append(org_name) - # rg_obj = RepoGroup.get_by_name(session, org_name) - # logger.info(rg_obj) - # if rg_obj: - # # add the orgs repos to the group - # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + # rg_obj = RepoGroup.get_by_name(session, org_name) + # logger.info(rg_obj) + # if rg_obj: + # # add the orgs repos to the group + # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} - elif Repo.parse_gitlab_repo_url(url)[0]: + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: - org_name, repo_name = Repo.parse_github_repo_url(url) - repo_git = f"https://gitlab.com/{org_name}/{repo_name}" + org_name, repo_name = Repo.parse_github_repo_url(url) + repo_git = f"https://gitlab.com/{org_name}/{repo_name}" - # TODO: gitlab ensure the whole repo git is inserted so it can be found here - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) - - else: - invalid_urls.append(url) + # TODO: gitlab ensure the whole repo git is inserted so it can be found here + # repo_obj = Repo.get_by_repo_git(session, repo_git) + # if repo_obj: + # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + else: + invalid_urls.append(url) + + if orgs or repo_urls: repo_urls = [url.lower() for url in repo_urls] orgs = [url.lower() for url in orgs] + flash(f"Adding repos: {repo_urls}") + flash(f"Adding orgs: {orgs}") add_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() flash("Adding repos and orgs in the background") diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index e65f4665d..5b463ba78 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -558,7 +558,7 @@ def get_group_by_name(user_id, group_name): except s.orm.exc.NoResultFound: return None - return user_group.group_id + return user_group def get_repo_group_by_name(name): diff --git a/augur/application/schema/alembic/versions/30_add_repo_src_id.py b/augur/application/schema/alembic/versions/30_add_repo_src_id.py index d47eb568c..013890697 100644 --- a/augur/application/schema/alembic/versions/30_add_repo_src_id.py +++ b/augur/application/schema/alembic/versions/30_add_repo_src_id.py @@ -1,4 +1,4 @@ -"""Add commit message table +"""Add repo src id Revision ID: 30 Revises: 29 diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index e20d539c2..eae4b2682 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -7,7 +7,7 @@ from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from augur.application.db.lib import get_group_by_name, get_repo_group_by_name, get_repo_by_repo_git, get_repo_by_src_id from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models.augur_operations import retrieve_owner_repos, 
FRONTEND_REPO_GROUP_NAME, RepoGroup +from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, UserGroup from augur.application.db.models import UserRepo, Repo, User @@ -30,7 +30,7 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): with GithubTaskSession(logger) as session: # determine group id from name - group = get_group_by_name(session, user_id, group_name) + group = get_group_by_name(user_id, group_name) if not group: logger.error(f"Error while adding repo. Invalid group name of {group_name}. Cannot insert repos") return @@ -38,7 +38,7 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): group_id = group.group_id # get frontend repo group - frontend_repo_group = RepoGroup.get_by_name(FRONTEND_REPO_GROUP_NAME) + frontend_repo_group = RepoGroup.get_by_name(session, FRONTEND_REPO_GROUP_NAME) if not frontend_repo_group: logger.error("Error while adding repo: Could not find frontend repo group so repos cannot be inserted") return @@ -47,27 +47,28 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): # define repo_data and assoicate repos with frontend repo group - repo_data = [tuple(url, repo_group_id) for url in repo_urls] + repo_data = [(url, repo_group_id) for url in repo_urls] for org in orgs: # create repo group for org if it doesn't exist - repo_group = RepoGroup.get_by_name(org) + repo_group = RepoGroup.get_by_name(session, org) if not repo_group: repo_group = create_repo_group(session, org) # retrieve repo urls for org - org_repos, _ = retrieve_owner_repos(url) + org_repos, _ = retrieve_owner_repos(session, org) if not org_repos: continue # define urls and repo_group_id of org and then add to repo_data - org_repo_data = [tuple(url, repo_group.repo_group_id) for url in org_repos] + org_repo_data = [(url, repo_group.repo_group_id) for url in org_repos] repo_data.extend(org_repo_data) # get data for repos to determine type, src id, and if they exist data = get_repos_data(repo_data, session, logger) + print(f"Repo data: {data}") for url, repo_group_id in repo_data: @@ -83,14 +84,12 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - logger.warning(f"Error while adding repo: Repo already exists with {url}") continue repo = get_repo_by_src_id(repo_src_id) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - logger.warning(f"Error while adding repo: Repo found with same src id. Inserting url: {url}. 
Inserting src_id {repo_src_id}") continue add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) @@ -98,6 +97,7 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): return +# TODO: Make it only get like 100 at a time def get_repos_data(repo_data, session, logger): repo_urls = [x[0] for x in repo_data] @@ -108,10 +108,10 @@ def get_repos_data(repo_data, session, logger): repo_map = {} for i, url in enumerate(repo_urls): owner, repo = get_owner_repo(url) - query_parts.append(f"""{i}: repository(owner: "{owner}", name: "{repo}") {{ + query_parts.append(f"""repo_{i}: repository(owner: "{owner}", name: "{repo}") {{ databaseId, owner {{ __typename }} }}""") - repo_map[url] = i + repo_map[url] = f"repo_{i}" query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" From f7baea0ccf3f8084b3ce6dd323b6e7899cf76475 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 16 Sep 2024 18:34:47 -0500 Subject: [PATCH 10/21] consider src id first --- augur/tasks/frontend.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index eae4b2682..9d811d04e 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -68,7 +68,6 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): # get data for repos to determine type, src id, and if they exist data = get_repos_data(repo_data, session, logger) - print(f"Repo data: {data}") for url, repo_group_id in repo_data: @@ -80,17 +79,17 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): repo_src_id = repo_data["databaseId"] repo_type = repo_data["owner"]["__typename"] - repo = get_repo_by_repo_git(session, url) + repo = get_repo_by_src_id(repo_src_id) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - continue + continue - repo = get_repo_by_src_id(repo_src_id) + repo = get_repo_by_repo_git(session, url) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - continue + continue add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) From 1966ad61dd2ab4a2ca00ef78f22942a9f65bb297 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 16 Sep 2024 18:49:55 -0500 Subject: [PATCH 11/21] process 100 repos at a time --- augur/api/view/api.py | 12 ------- augur/tasks/frontend.py | 77 +++++++++++++++++++++++------------------ 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 9f5c4a235..ae5a8c250 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -14,18 +14,6 @@ def cache(file=None): return redirect(url_for('static', filename="cache")) return redirect(url_for('static', filename="cache/" + toCacheFilename(file, False))) - -def add_existing_repo_to_group(session, user_id, group_name, repo_id): - - logger.info("Adding existing repo to group") - - group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) - if group_id is None: - return False - - result = UserRepo.insert(session, repo_id, group_id) - if not result: - return False def add_existing_org_to_group(session, user_id, group_name, rg_id): diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index 9d811d04e..e48e24d62 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py 
@@ -65,35 +65,57 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): org_repo_data = [(url, repo_group.repo_group_id) for url in org_repos] repo_data.extend(org_repo_data) + # break list of repos into lists of 100 so that graphql query isn't overwhelmed + for chunk in divide_list_into_chunks(repo_data, 100): - # get data for repos to determine type, src id, and if they exist - data = get_repos_data(repo_data, session, logger) + add_new_repos(chunk, group_id, session, logger) - for url, repo_group_id in repo_data: + return + - repo_data = data[url] - if not repo_data: - # skip since cause the repo is not valid (doesn't exist likely) - continue +def add_new_repos(repo_data, group_id, session, logger): - repo_src_id = repo_data["databaseId"] - repo_type = repo_data["owner"]["__typename"] + # get data for repos to determine type, src id, and if they exist + data = get_repos_data(repo_data, session, logger) - repo = get_repo_by_src_id(repo_src_id) - if repo: - # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record - add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - continue + for url, repo_group_id in repo_data: - repo = get_repo_by_repo_git(session, url) - if repo: - # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record - add_existing_repo_to_group(logger, session, user_id, group_name, repo.repo_id) - continue + repo_data = data[url] + if not repo_data: + # skip since cause the repo is not valid (doesn't exist likely) + continue - add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) + repo_src_id = repo_data["databaseId"] + repo_type = repo_data["owner"]["__typename"] - return + repo = get_repo_by_src_id(repo_src_id) + if repo: + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, group_id, repo.repo_id) + continue + + repo = get_repo_by_repo_git(session, url) + if repo: + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, group_id, repo.repo_id) + continue + + add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) + + +def add_existing_repo_to_group(logger, session, group_id, repo_id): + + logger.info("Adding existing repo to group") + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False + + +def divide_list_into_chunks(data, size): + + for i in range(0, len(data), size): + yield data[i:i + size] # TODO: Make it only get like 100 at a time @@ -128,19 +150,6 @@ def get_repos_data(repo_data, session, logger): def get_repo_by_repo_git(session, url): return session.query(Repo).filter(Repo.repo_git == url).first() - - -def add_existing_repo_to_group(logger, session, user_id, group_name, repo_id): - - logger.info("Adding existing repo to group") - - group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) - if group_id is None: - return False - - result = UserRepo.insert(session, repo_id, group_id) - if not result: - return False def create_repo_group(session, owner): From cfed800b37016a493834f7b634f8d37f08881b3e Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 16 Sep 2024 18:55:20 -0500 Subject: [PATCH 12/21] improve organization and remove commented code --- augur/api/view/api.py | 30 -------------------------- augur/tasks/frontend.py | 47 
++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 52 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index ae5a8c250..c260b65fa 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -64,58 +64,28 @@ def av_add_user_repo(): # matches https://github.com/{org}/ or htts://github.com/{org} if (org_name := Repo.parse_github_org_url(url)): - orgs.append(org_name) - # rg_obj = RepoGroup.get_by_name(session, org_name) - # if rg_obj: - # # add the orgs repos to the group - # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: - repo_urls.append(url) - # org_name, repo_name = Repo.parse_github_repo_url(url) - # repo_git = f"https://github.com/{org_name}/{repo_name}" - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} elif (match := parse_org_and_repo_name(url)): org, repo = match.groups() repo_git = f"https://github.com/{org}/{repo}" - repo_urls.append(repo_git) - - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) # matches /{org}/ or /{org} or {org}/ or {org} elif (match := parse_org_name(url)): org_name = match.group(1) - orgs.append(org_name) - # rg_obj = RepoGroup.get_by_name(session, org_name) - # logger.info(rg_obj) - # if rg_obj: - # # add the orgs repos to the group - # add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} elif Repo.parse_gitlab_repo_url(url)[0]: org_name, repo_name = Repo.parse_github_repo_url(url) repo_git = f"https://gitlab.com/{org_name}/{repo_name}" - - # TODO: gitlab ensure the whole repo git is inserted so it can be found here - # repo_obj = Repo.get_by_repo_git(session, repo_git) - # if repo_obj: - # add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) else: invalid_urls.append(url) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index e48e24d62..cac4d9fe7 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -45,33 +45,40 @@ def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): repo_group_id = frontend_repo_group.repo_group_id - # define repo_data and assoicate repos with frontend repo group repo_data = [(url, repo_group_id) for url in repo_urls] - for org in orgs: - - # create repo group for org if it doesn't exist - repo_group = RepoGroup.get_by_name(session, org) - if not repo_group: - repo_group = create_repo_group(session, org) - - # retrieve repo urls for org - org_repos, _ = retrieve_owner_repos(session, org) - if not org_repos: - continue - - # define urls and repo_group_id of org and then add to repo_data - org_repo_data = [(url, repo_group.repo_group_id) for url in org_repos] - repo_data.extend(org_repo_data) + # get org repos and associate them with their org repo group + org_repo_data = get_org_repo_data(orgs, session) + repo_data.extend(org_repo_data) # break list of repos into lists of 100 so that graphql query isn't overwhelmed for chunk in divide_list_into_chunks(repo_data, 100): add_new_repos(chunk, group_id, session, logger) - return +def get_org_repo_data(orgs, session): + 
+ repo_data = [] + for org in orgs: + + # create repo group for org if it doesn't exist + repo_group = RepoGroup.get_by_name(session, org) + if not repo_group: + repo_group = create_repo_group(session, org) + + # retrieve repo urls for org + org_repos, _ = retrieve_owner_repos(session, org) + if not org_repos: + continue + + # define urls and repo_group_id of org and then add to repo_data + org_repo_data = [(url, repo_group.repo_group_id) for url in org_repos] + repo_data.extend(org_repo_data) + + return repo_data + def add_new_repos(repo_data, group_id, session, logger): @@ -104,12 +111,8 @@ def add_new_repos(repo_data, group_id, session, logger): def add_existing_repo_to_group(logger, session, group_id, repo_id): - - logger.info("Adding existing repo to group") - result = UserRepo.insert(session, repo_id, group_id) - if not result: - return False + UserRepo.insert(session, repo_id, group_id) def divide_list_into_chunks(data, size): From 18a84d1f102688ba9f42647af8ba93142750ca0a Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 16 Sep 2024 18:56:25 -0500 Subject: [PATCH 13/21] rename task to be github specific --- augur/api/view/api.py | 4 ++-- augur/tasks/frontend.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index c260b65fa..d1f4ebb80 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -2,7 +2,7 @@ import re from flask_login import current_user, login_required from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo -from augur.tasks.frontend import add_orgs_and_repos, parse_org_and_repo_name, parse_org_name +from augur.tasks.frontend import add_github_orgs_and_repos, parse_org_and_repo_name, parse_org_name from .utils import * from ..server import app from augur.application.db.session import DatabaseSession @@ -97,7 +97,7 @@ def av_add_user_repo(): orgs = [url.lower() for url in orgs] flash(f"Adding repos: {repo_urls}") flash(f"Adding orgs: {orgs}") - add_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() + add_github_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() flash("Adding repos and orgs in the background") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index cac4d9fe7..614f4013e 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -23,9 +23,9 @@ def parse_org_and_repo_name(string): # TODO: Add support for gitlab @celery.task -def add_orgs_and_repos(user_id, group_name, orgs, repo_urls): +def add_github_orgs_and_repos(user_id, group_name, orgs, repo_urls): - logger = logging.getLogger(add_orgs_and_repos.__name__) + logger = logging.getLogger(add_github_orgs_and_repos.__name__) with GithubTaskSession(logger) as session: From aebe6abf46c2c7dd4c4c3d7933410364f75e68e3 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 16 Sep 2024 18:58:42 -0500 Subject: [PATCH 14/21] rename to be github specific and add template for gitlab task --- augur/tasks/frontend.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index 614f4013e..db04ecb8c 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -55,7 +55,40 @@ def add_github_orgs_and_repos(user_id, group_name, orgs, repo_urls): # break list of repos into lists of 100 so that graphql query isn't overwhelmed for chunk in divide_list_into_chunks(repo_data, 100): - add_new_repos(chunk, group_id, session, logger) + 
add_new_github_repos(chunk, group_id, session, logger) + + +# TODO: Add support for gitlab +@celery.task +def add_gitlab_repos(user_id, group_name, repo_urls): + + logger = logging.getLogger(add_github_orgs_and_repos.__name__) + + with GithubTaskSession(logger) as session: + + # determine group id from name + group = get_group_by_name(user_id, group_name) + if not group: + logger.error(f"Error while adding repo. Invalid group name of {group_name}. Cannot insert repos") + return + + group_id = group.group_id + + # get frontend repo group + frontend_repo_group = RepoGroup.get_by_name(session, FRONTEND_REPO_GROUP_NAME) + if not frontend_repo_group: + logger.error("Error while adding repo: Could not find frontend repo group so repos cannot be inserted") + return + + repo_group_id = frontend_repo_group.repo_group_id + + # define repo_data and assoicate repos with frontend repo group + repo_data = [(url, repo_group_id) for url in repo_urls] + + # break list of repos into lists of 100 so that graphql query isn't overwhelmed + for chunk in divide_list_into_chunks(repo_data, 100): + + add_new_github_repos(chunk, group_id, session, logger) def get_org_repo_data(orgs, session): @@ -80,10 +113,10 @@ def get_org_repo_data(orgs, session): return repo_data -def add_new_repos(repo_data, group_id, session, logger): +def add_new_github_repos(repo_data, group_id, session, logger): # get data for repos to determine type, src id, and if they exist - data = get_repos_data(repo_data, session, logger) + data = get_github_repos_data(repo_data, session, logger) for url, repo_group_id in repo_data: @@ -122,7 +155,7 @@ def divide_list_into_chunks(data, size): # TODO: Make it only get like 100 at a time -def get_repos_data(repo_data, session, logger): +def get_github_repos_data(repo_data, session, logger): repo_urls = [x[0] for x in repo_data] From d39cca5a92dbed9c35af89cf66a52d2d3f1971b3 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 17 Sep 2024 17:38:15 -0500 Subject: [PATCH 15/21] add support for gitlab repo --- augur/api/view/api.py | 7 ++++++- augur/tasks/frontend.py | 34 ++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index d1f4ebb80..5b0a7ebdc 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -2,7 +2,7 @@ import re from flask_login import current_user, login_required from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo -from augur.tasks.frontend import add_github_orgs_and_repos, parse_org_and_repo_name, parse_org_name +from augur.tasks.frontend import add_github_orgs_and_repos, parse_org_and_repo_name, parse_org_name, add_gitlab_repos from .utils import * from ..server import app from augur.application.db.session import DatabaseSession @@ -60,6 +60,7 @@ def av_add_user_repo(): orgs = [] repo_urls = [] + gitlab_repo_urls = [] for url in urls: # matches https://github.com/{org}/ or htts://github.com/{org} @@ -87,6 +88,7 @@ def av_add_user_repo(): org_name, repo_name = Repo.parse_github_repo_url(url) repo_git = f"https://gitlab.com/{org_name}/{repo_name}" + gitlab_repo_urls.append(repo_git) else: invalid_urls.append(url) @@ -99,6 +101,9 @@ def av_add_user_repo(): flash(f"Adding orgs: {orgs}") add_github_orgs_and_repos.si(current_user.user_id, group, orgs, repo_urls).apply_async() + if gitlab_repo_urls: + add_gitlab_repos(current_user.user_id, group, gitlab_repo_urls) + flash("Adding repos and orgs in the background") return redirect(url_for("user_settings") + 
"?section=tracker") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index db04ecb8c..e1543c2fb 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -82,15 +82,33 @@ def add_gitlab_repos(user_id, group_name, repo_urls): repo_group_id = frontend_repo_group.repo_group_id - # define repo_data and assoicate repos with frontend repo group - repo_data = [(url, repo_group_id) for url in repo_urls] + for url in repo_urls: - # break list of repos into lists of 100 so that graphql query isn't overwhelmed - for chunk in divide_list_into_chunks(repo_data, 100): + result = Repo.is_valid_gitlab_repo(session, url) + if not result[0]: + continue + + # TODO: Add logic to get gitlab src id + + repo = get_repo_by_repo_git(session, url) + if repo: + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, group_id, repo.repo_id) + + add_gitlab_repo(session, url, repo_group_id, group_id) + + +def add_gitlab_repo(session, url, repo_group_id, group_id): + + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend") + if not repo_id: + return False, {"status": "Repo insertion failed", "repo_url": url} + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False, {"status": "repo_user insertion failed", "repo_url": url} - add_new_github_repos(chunk, group_id, session, logger) - def get_org_repo_data(orgs, session): repo_data = [] @@ -140,7 +158,7 @@ def add_new_github_repos(repo_data, group_id, session, logger): add_existing_repo_to_group(logger, session, group_id, repo.repo_id) continue - add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) + add_github_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id) def add_existing_repo_to_group(logger, session, group_id, repo_id): @@ -196,7 +214,7 @@ def create_repo_group(session, owner): return repo_group -def add_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id): +def add_github_repo(logger, session, url, repo_group_id, group_id, repo_type, repo_src_id): # These two things really need to be done in one commit in the future to prevent one existing without the other repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type, repo_src_id) From 449917f22110e7d19d82affba88705828defc6d2 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 28 Sep 2024 09:28:10 -0500 Subject: [PATCH 16/21] improve logic --- augur/tasks/frontend.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index e1543c2fb..df7808285 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -130,7 +130,7 @@ def get_org_repo_data(orgs, session): return repo_data - +# TODO: Do we need to check if the repo already exists in the user group? 
def add_new_github_repos(repo_data, group_id, session, logger): # get data for repos to determine type, src id, and if they exist @@ -186,18 +186,14 @@ def get_github_repos_data(repo_data, session, logger): query_parts.append(f"""repo_{i}: repository(owner: "{owner}", name: "{repo}") {{ databaseId, owner {{ __typename }} }}""") - repo_map[url] = f"repo_{i}" query = f"query GetRepoIds {{ {' '.join(query_parts)}}}" data = github_graphql_data_access.get_resource(query, {}, []) result_data = {} - for url in repo_urls: - key =repo_map[url] - repo_data = data[key] - - result_data[url] = repo_data + for i, url in enumerate(repo_urls): + result_data[url] = data[f"repo_{i}"] return result_data From 08a92a164f8095586fbb42b02eaa2d7edbaa6863 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 8 Oct 2024 12:42:09 -0500 Subject: [PATCH 17/21] add logic to support gitlab repos --- augur/application/db/models/augur_data.py | 5 +- augur/tasks/frontend.py | 75 ++++++++++++++++++++--- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index baf01b5ac..a645ea1ba 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -1065,7 +1065,7 @@ def parse_github_org_url(url): return result.groups()[0] @staticmethod - def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): + def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source, repo_src_id): """Add a repo to the repo table. Args: @@ -1099,7 +1099,8 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): "repo_type": None, "tool_source": tool_source, "tool_version": "1.0", - "data_source": "Git" + "data_source": "Git", + "repo_src_id": repo_src_id } repo_unique = ["repo_git"] diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index df7808285..cd38d0acc 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -1,15 +1,18 @@ import logging import re import sqlalchemy as s +import urllib.parse +from time import sleep from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess -from augur.application.db.lib import get_group_by_name, get_repo_group_by_name, get_repo_by_repo_git, get_repo_by_src_id +from augur.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_repo_by_src_id from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, UserGroup +from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup +from augur.tasks.github.util.github_paginator import hit_api -from augur.application.db.models import UserRepo, Repo, User +from augur.application.db.models import UserRepo, Repo def parse_org_name(string): @@ -84,18 +87,29 @@ def add_gitlab_repos(user_id, group_name, repo_urls): for url in repo_urls: - result = Repo.is_valid_gitlab_repo(session, url) - if not result[0]: + result = get_gitlab_repo_data(session, url, logger) + if not result: continue - # TODO: Add logic to get gitlab src id + if "id" not in result: + logger.error(f"Gitlab repo data returned without id. Url: {url}. 
Data: {result}") + continue + + repo_src_id = result["id"] + + repo = get_repo_by_src_id(repo_src_id) + if repo: + # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record + add_existing_repo_to_group(logger, session, group_id, repo.repo_id) + continue repo = get_repo_by_repo_git(session, url) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, group_id, repo.repo_id) + continue - add_gitlab_repo(session, url, repo_group_id, group_id) + add_github_repo(logger, session, url, repo_group_id, group_id, repo_src_id) def add_gitlab_repo(session, url, repo_group_id, group_id): @@ -222,6 +236,53 @@ def add_github_repo(logger, session, url, repo_group_id, group_id, repo_type, re if not result: logger.error(f"Error while adding repo: Failed to insert user repo record. A record with a repo_id of {repo_id} and a group id of {group_id} needs to be added to the user repo table so that this repo shows up in the users group") return + + +def get_gitlab_repo_data(gl_session, url: str, logger) -> bool: + + REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + logger.error(f"Tried to get gitlab repo data for invalid url: {url}") + return None + + # Encode namespace and project name for the API request + project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='') + url = REPO_ENDPOINT.format(project_identifier) + + attempts = 0 + while attempts < 10: + response = hit_api(gl_session.oauths, url, logger) + + if wait_in_seconds := response.headers.get("Retry-After") is not None: + sleep(int(wait_in_seconds)) + + if response.status_code == 404: + return None + + if response.status_code == 200: + return response.json() + + attempts += 1 + sleep(attempts*3) + + logger.error(f"Failed to get gitlab repo data after multiple attemps. Url: {url}") + + return None + +def add_gitlab_repo(logger, session, url, repo_group_id, group_id, repo_src_id): + + # These two things really need to be done in one commit in the future to prevent one existing without the other + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend", repo_src_id) + if not repo_id: + logger.error("Error while adding repo: Failed to insert github repo") + return + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + logger.error(f"Error while adding repo: Failed to insert user repo record. 
A record with a repo_id of {repo_id} and a group id of {group_id} needs to be added to the user repo table so that this repo shows up in the users group") + return # @celery.task # def add_org_repo_list(user_id, group_name, urls): From 6940a262ef5d2a168b05ea337b0ea5a68fb0214e Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 8 Oct 2024 12:49:33 -0500 Subject: [PATCH 18/21] update src id search to only match on relevant repos --- augur/application/db/lib.py | 14 ++++++++++++-- augur/tasks/frontend.py | 6 +++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 9de1243e6..4d10b9011 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -144,15 +144,25 @@ def get_repo_by_repo_id(repo_id): return repo -def get_repo_by_src_id(src_id): +def get_github_repo_by_src_id(src_id): with get_session() as session: - query = session.query(Repo).filter(Repo.repo_src_id == src_id) + query = session.query(Repo).filter(Repo.repo_src_id == src_id, Repo.repo_git.ilike(f'%https://github.com%')) repo = execute_session_query(query, 'first') return repo +def get_gitlab_repo_by_src_id(src_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_src_id == src_id, Repo.repo_git.ilike(f'%https://gitlab.com%')) + repo = execute_session_query(query, 'first') + + return repo + + def remove_working_commits_by_repo_id_and_hashes(repo_id, commit_hashes): remove_working_commits = s.sql.text("""DELETE FROM working_commits diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index cd38d0acc..da4c1cd9d 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -7,7 +7,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess -from augur.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_repo_by_src_id +from augur.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup from augur.tasks.github.util.github_paginator import hit_api @@ -97,7 +97,7 @@ def add_gitlab_repos(user_id, group_name, repo_urls): repo_src_id = result["id"] - repo = get_repo_by_src_id(repo_src_id) + repo = get_gitlab_repo_by_src_id(repo_src_id) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, group_id, repo.repo_id) @@ -160,7 +160,7 @@ def add_new_github_repos(repo_data, group_id, session, logger): repo_src_id = repo_data["databaseId"] repo_type = repo_data["owner"]["__typename"] - repo = get_repo_by_src_id(repo_src_id) + repo = get_github_repo_by_src_id(repo_src_id) if repo: # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, group_id, repo.repo_id) From 8786f31efec978d88752e6aeba774f8614d4942f Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 15 Oct 2024 17:53:46 -0500 Subject: [PATCH 19/21] remove TODOs --- augur/tasks/frontend.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index da4c1cd9d..f526d9041 100644 
--- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -24,7 +24,6 @@ def parse_org_and_repo_name(string): match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) return match -# TODO: Add support for gitlab @celery.task def add_github_orgs_and_repos(user_id, group_name, orgs, repo_urls): @@ -61,7 +60,6 @@ def add_github_orgs_and_repos(user_id, group_name, orgs, repo_urls): add_new_github_repos(chunk, group_id, session, logger) -# TODO: Add support for gitlab @celery.task def add_gitlab_repos(user_id, group_name, repo_urls): @@ -109,7 +107,7 @@ def add_gitlab_repos(user_id, group_name, repo_urls): add_existing_repo_to_group(logger, session, group_id, repo.repo_id) continue - add_github_repo(logger, session, url, repo_group_id, group_id, repo_src_id) + add_gitlab_repo(logger, session, url, repo_group_id, group_id, repo_src_id) def add_gitlab_repo(session, url, repo_group_id, group_id): From 055fa7cabcc98947cb87a89e9e4740355fcf3c30 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 15 Oct 2024 18:23:20 -0500 Subject: [PATCH 20/21] update version ID Signed-off-by: Sean P. Goggins --- metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata.py b/metadata.py index 018f51d32..4a3d19a38 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.76.3" -__release__ = "v0.76.3 (Pumpkin Laser)" +__version__ = "0.76.4" +__release__ = "v0.76.4 (Pumpkin Core)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2112" From 73c8fb32d3af44c80eb3174e501862c260bf558e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 15 Oct 2024 18:27:07 -0500 Subject: [PATCH 21/21] README Version Update Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 69aa2f551..31c626493 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.76.3 +# Augur NEW Release v0.76.4 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.3 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.4 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
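
A note on the chunk size used by `add_github_orgs_and_repos`: `get_github_repos_data` builds a single GraphQL document with one aliased `repository(...)` field per URL, so each chunk of up to 100 repos resolves its `databaseId` and owner type in one request instead of one hundred. A minimal standalone sketch of that batching pattern, using `requests` and a hypothetical `token` argument rather than Augur's `GithubGraphQlDataAccess` and key rotation:

```python
import requests  # stand-in for GithubGraphQlDataAccess.get_resource

GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

def build_batched_repo_query(repo_urls):
    """Alias one repository(...) lookup per URL so a whole chunk resolves in a single request."""
    parts = []
    for i, url in enumerate(repo_urls):
        owner, repo = url.rstrip("/").split("/")[-2:]
        parts.append(
            f'repo_{i}: repository(owner: "{owner}", name: "{repo}") '
            "{ databaseId owner { __typename } }"
        )
    return f"query GetRepoIds {{ {' '.join(parts)} }}"

def fetch_github_repo_metadata(repo_urls, token):
    """Return a mapping of url -> {"databaseId": ..., "owner": {"__typename": ...}} (or None if missing)."""
    query = build_batched_repo_query(repo_urls)
    response = requests.post(
        GITHUB_GRAPHQL_URL,
        json={"query": query},
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    data = response.json()["data"]
    # The alias index is the URL's position in the input list, so results map
    # straight back to their URLs without a separate lookup table.
    return {url: data[f"repo_{i}"] for i, url in enumerate(repo_urls)}
```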
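The GitLab path added in patch 17 resolves `repo_src_id` through the REST API rather than GraphQL: the `{owner}/{repo}` path is URL-encoded into a single project identifier, and the numeric `id` field of the response becomes the source id. Below is a sketch of the same flow using a plain `requests` call in place of `hit_api` and key rotation; the `Retry-After` assignment is parenthesized here so that the header value, not the boolean result of `is not None`, is what gets passed to `sleep`.

```python
import urllib.parse
from time import sleep

import requests  # stand-in for hit_api(gl_session.oauths, url, logger)

GITLAB_PROJECT_ENDPOINT = "https://gitlab.com/api/v4/projects/{}"

def fetch_gitlab_project(owner, repo, max_attempts=10):
    """Return the GitLab project payload (whose "id" is the repo_src_id) or None."""
    # GitLab accepts a URL-encoded "namespace/project" string in place of the numeric project id
    project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe="")
    url = GITLAB_PROJECT_ENDPOINT.format(project_identifier)

    for attempt in range(1, max_attempts + 1):
        response = requests.get(url, timeout=30)

        # Honor rate limiting before interpreting the status code
        if (wait_in_seconds := response.headers.get("Retry-After")) is not None:
            sleep(int(wait_in_seconds))
            continue

        if response.status_code == 404:
            return None                # project does not exist or is not visible
        if response.status_code == 200:
            return response.json()

        sleep(attempt * 3)             # back off on transient errors and retry

    return None
```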
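Patch 18 splits the source-id lookup into GitHub- and GitLab-specific helpers because the two platforms assign ids from unrelated sequences, so the same integer can identify different projects on each host; filtering on the `repo_git` host keeps a GitLab project from matching a GitHub repo that happens to share its source id. A generalized sketch of that query, assuming the `Repo` model and a SQLAlchemy session as used elsewhere in `augur.application.db`:

```python
from augur.application.db.models import Repo  # same model used by the lib.py helpers

def get_repo_by_src_id_for_host(session, src_id, host):
    """Match repo_src_id only against repos whose clone URL lives on `host`.

    `host` is e.g. "github.com" or "gitlab.com"; the ilike filter mirrors the
    platform-specific helpers added in augur/application/db/lib.py.
    """
    return (
        session.query(Repo)
        .filter(
            Repo.repo_src_id == src_id,
            Repo.repo_git.ilike(f"%https://{host}%"),
        )
        .first()
    )
```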