diff --git a/README.md b/README.md index 02ec125fb..c0c99157c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.70.0 +# Augur NEW Release v0.71.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.71.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/augur/api/routes/config.py b/augur/api/routes/config.py index 6a2f82976..08618091a 100644 --- a/augur/api/routes/config.py +++ b/augur/api/routes/config.py @@ -8,6 +8,7 @@ # Disable the requirement for SSL by setting env["AUGUR_DEV"] = True from augur.application.config import get_development_flag +from augur.application.db.lib import get_session from augur.application.db.models import Config from augur.application.config import AugurConfig from augur.application.db.session import DatabaseSession @@ -45,7 +46,7 @@ def update_config(): update_dict = request.get_json() - with DatabaseSession(logger, engine=current_app.engine) as session: + with get_session() as session: for section, data in update_dict.items(): diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index 990a6e736..621c89604 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -92,7 +92,7 @@ def core_task_success_util_gen(repo_git): deiHook = CollectionRequest("core",primary_enabled_phases) deiHook.repo_list = [repo_url] - singleRoutine = AugurTaskRoutine(session,[deiHook]) + singleRoutine = AugurTaskRoutine(logger, session,[deiHook]) singleRoutine.start_data_collection() #start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") diff --git a/augur/api/view/augur_view.py b/augur/api/view/augur_view.py index 5166f2c5a..ff4b25145 100644 --- a/augur/api/view/augur_view.py +++ b/augur/api/view/augur_view.py @@ -48,6 +48,7 @@ def internal_server_error(error): traceback.print_tb(error.__traceback__, file=errout) # traceback.print_exception(error, file=errout) stacktrace = errout.getvalue() + stacktrace += f"\n{type(error).__name__}: {str(error)}" errout.close() except Exception as e: logger.error(e) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index d7a8ad745..2a2deadd1 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -100,6 +100,7 @@ def start(ctx, disable_collection, development, port): create_collection_status_records.si().apply_async() time.sleep(3) + #put contributor breadth back in. 
Not sure why it was commented out contributor_breadth_model.si().apply_async() # start cloning repos when augur starts @@ -164,15 +165,15 @@ def determine_worker_processes(ratio,maximum): process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 - #60% of estimate, Maximum value of 45 - core_num_processes = determine_worker_processes(.6, 45) + #60% of estimate, Maximum value of 45 : Reduced because it can be lower + core_num_processes = determine_worker_processes(.15, 10) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 45) + secondary_num_processes = determine_worker_processes(.70, 60) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) @@ -317,7 +318,7 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(session, repo[0],1) @cli.command('export-env') diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 7d65cad97..c13e64832 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -124,15 +124,15 @@ def determine_worker_processes(ratio,maximum): process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 - #60% of estimate, Maximum value of 45 - core_num_processes = determine_worker_processes(.6, 45) + #60% of estimate, Maximum value of 45: Reduced because not needed + core_num_processes = determine_worker_processes(.15, 10) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 45) + secondary_num_processes = determine_worker_processes(.70, 60) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) @@ -301,4 +301,4 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(session, repo[0],1) diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index c64dce5b8..d25f081ab 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -36,8 +36,8 @@ def start(): secondary_worker_process = None scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q 
scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=20 -n core:{uuid.uuid4().hex}@%h" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=60 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) diff --git a/augur/application/config.py b/augur/application/config.py index 8998d6094..bfda4c877 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -288,6 +288,7 @@ def add_or_update_settings(self, settings: List[dict]): query = self.session.query(Config).filter(and_(Config.section_name == setting["section_name"],Config.setting_name == setting["setting_name"]) ) if execute_session_query(query, 'first') is None: + # TODO: Update to use bulk insert dicts so config doesn't require database session self.session.insert_data(setting,Config, ["section_name", "setting_name"]) else: #If setting exists. use raw update to not increase autoincrement diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index c1da707db..cc785951a 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -1,9 +1,19 @@ -import sqlalchemy as s +import time +import random import logging -from typing import List, Any, Optional -from augur.application.db.models import Config -from augur.application.db import get_session +import sqlalchemy as s +from sqlalchemy import func +from sqlalchemy.exc import DataError +from sqlalchemy.dialects import postgresql +from sqlalchemy.exc import OperationalError +from psycopg2.errors import DeadlockDetected +from typing import List, Any, Optional, Union + +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias, UnresolvedCommitEmail, Contributor, CollectionStatus +from augur.tasks.util.collection_state import CollectionState +from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query +from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") @@ -95,3 +105,408 @@ def get_value(section_name: str, setting_name: str) -> Optional[Any]: setting_dict = convert_type_of_value(setting_dict, logger) return setting_dict["value"] + + +def execute_sql(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + return_data = connection.execute(sql_text) + + return return_data + +def fetchall_data_from_sql_text(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + result = connection.execute(sql_text) + return [dict(row) for row in result.mappings()] + +def get_repo_by_repo_git(repo_git: str): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + return repo + +def get_repo_by_repo_id(repo_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + + return repo + +def 
remove_working_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + + execute_sql(remove_working_commits) + +def remove_working_commits_by_repo_id(repo_id): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_working_commits) + +def remove_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_commit = s.sql.text("""DELETE FROM commits + WHERE repo_id=:repo_id + AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + execute_sql(remove_commit) + + +def remove_commits_by_repo_id(repo_id): + + remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_commits) + +def get_working_commits_by_repo_id(repo_id): + + query = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id + """).bindparams(repo_id=repo_id) + + try: + working_commits = fetchall_data_from_sql_text(query) + except: + working_commits = [] + + return working_commits + +def get_worker_oauth_keys(platform: str): + + with get_session() as session: + + results = session.query(WorkerOauth).filter(WorkerOauth.platform == platform).order_by(func.random()).all() + + return [row.access_token for row in results] + +def get_active_repo_count(collection_type): + + with get_session() as session: + + return session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{collection_type}_status" ) == CollectionState.COLLECTING.value).count() + + +def facade_bulk_insert_commits(logger, records): + + with get_session() as session: + + try: + session.execute( + s.insert(Commit), + records, + ) + session.commit() + except Exception as e: + + if len(records) > 1: + logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") + + #split list into halves and retry insert until we isolate offending record + firsthalfRecords = records[:len(records)//2] + secondhalfRecords = records[len(records)//2:] + + facade_bulk_insert_commits(logger, firsthalfRecords) + facade_bulk_insert_commits(logger, secondhalfRecords) + elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": + commit_record = records[0] + #replace incomprehensible dates with epoch. 
+ #2021-10-11 11:57:46 -0500 + placeholder_date = "1970-01-01 00:00:15 -0500" + + #Check for improper utc timezone offset + #UTC timezone offset should be between -14:00 and +14:00 + + commit_record['author_timestamp'] = placeholder_date + commit_record['committer_timestamp'] = placeholder_date + + session.execute( + s.insert(Commit), + [commit_record], + ) + session.commit() + else: + raise e + + +def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: + + if isinstance(data, list) is False: + + # if a dict is passed to data then + # convert it to a list with one value + if isinstance(data, dict) is True: + data = [data] + + else: + logger.info("Data must be a list or a dict") + return None + + if len(data) == 0: + # self.logger.info("Gave no data to insert, returning...") + return None + + if isinstance(data[0], dict) is False: + logger.info("Must be list of dicts") + return None + + # remove any duplicate data + # this only counts something as a duplicate if every field is the same + data = remove_duplicates_by_uniques(data, natural_keys) + + # remove null data from string fields + if string_fields and isinstance(string_fields, list): + data = remove_null_characters_from_list_of_dicts(data, string_fields) + + # creates list of arguments to tell sqlalchemy what columns to return after the data is inserted + returning_args = [] + if return_columns: + for column in return_columns: + argument = getattr(table, column) + returning_args.append(argument) + + # creates insert on table + # that returns cols specificed in returning_args + # and inserts the data specified in data + # NOTE: if return_columns does not have an values this still works + stmnt = postgresql.insert(table).returning(*returning_args).values(data) + + + if on_conflict_update: + + # create a dict that the on_conflict_do_update method requires to be able to map updates whenever there is a conflict. 
See sqlalchemy docs for more explanation and examples: https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#updating-using-the-excluded-insert-values + setDict = {} + for key in data[0].keys(): + setDict[key] = getattr(stmnt.excluded, key) + + stmnt = stmnt.on_conflict_do_update( + #This might need to change + index_elements=natural_keys, + + #Columns to be updated + set_ = setDict + ) + + else: + stmnt = stmnt.on_conflict_do_nothing( + index_elements=natural_keys + ) + + + # print(str(stmnt.compile(dialect=postgresql.dialect()))) + attempts = 0 + # creates list from 1 to 10 / changed to 10-30 because deadlocks are taking longer + sleep_time_list = list(range(10,30)) + deadlock_detected = False + + engine = get_engine() + + # if there is no data to return then it executes the insert then returns nothing + if not return_columns: + + while attempts < 10: + try: + #begin keyword is needed for sqlalchemy 2.x + #this is because autocommit support was removed in 2.0 + with engine.begin() as connection: + connection.execute(stmnt) + break + except OperationalError as e: + # print(str(e).split("Process")[1].split(";")[0]) + if isinstance(e.orig, DeadlockDetected): + deadlock_detected = True + sleep_time = random.choice(sleep_time_list) + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + #self.logger.info(e) + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + logger.error("Unable to insert data in 10 attempts") + return None + + if deadlock_detected is True: + logger.error("Made it through even though Deadlock was detected") + + return "success" + + + # othewise it gets the requested return columns and returns them as a list of dicts + while attempts < 10: + try: + with engine.begin() as connection: + return_data_tuples = connection.execute(stmnt) + break + except OperationalError as e: + if isinstance(e.orig, DeadlockDetected): + sleep_time = random.choice(sleep_time_list) + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + logger.error("Unable to insert and return data in 10 attempts") + return None + + if deadlock_detected is True: + logger.error("Made it through even though Deadlock was detected") + + return_data = [dict(row) for row in return_data_tuples.mappings()] + + #no longer working in sqlalchemy 2.x + #for data_tuple in return_data_tuples: + # return_data.append(dict(data_tuple)) + + # using on confilict do nothing does not return the + # present values so this does gets the return values + if not on_conflict_update: + + conditions = [] + for column in natural_keys: + + 
column_values = [value[column] for value in data] + + column = getattr(table, column) + + conditions.append(column.in_(tuple(column_values))) + + with get_session() as session: + + result = ( + session.query(table).filter(*conditions).all() + ) + + for row in result: + + return_dict = {} + for field in return_columns: + + return_dict[field] = getattr(row, field) + + return_data.append(return_dict) + + + return return_data + + + +def get_issues_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(Issue).filter(Issue.repo_id == repo_id).all() + +def get_pull_requests_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + +def get_pull_request_reviews_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id).all() + +def get_contributor_aliases_by_email(email): + + with get_session() as session: + + return session.query(ContributorsAlias).filter_by(alias_email=email).all() + +def get_unresolved_commit_emails_by_name(name): + + with get_session() as session: + + return session.query(UnresolvedCommitEmail).filter_by(name=name).all() + +def get_contributors_by_full_name(full_name): + + with get_session() as session: + + return session.query(Contributor).filter_by(cntrb_full_name=full_name).all() + +def get_contributors_by_github_user_id(id): + + with get_session() as session: + + # Look into this, where it was used was doing .all() but this query should really only return one + return session.query(Contributor).filter_by(gh_user_id=id).all() + +def update_issue_closed_cntrbs_by_repo_id(repo_id): + + engine = get_engine() + + get_ranked_issues = s.text(f""" + WITH RankedIssues AS ( + SELECT repo_id, issue_id, cntrb_id, + ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn + FROM issue_events + WHERE "action" = 'closed' + ) + + SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL + """) + + with engine.connect() as conn: + result = conn.execute(get_ranked_issues).fetchall() + + update_data = [] + for row in result: + update_data.append( + { + 'issue_id': row[0], + 'cntrb_id': row[1], + 'repo_id': repo_id + } + ) + + if update_data: + with engine.connect() as connection: + update_stmt = s.text(""" + UPDATE issues + SET cntrb_id = :cntrb_id + WHERE issue_id = :issue_id + AND repo_id = :repo_id + """) + connection.execute(update_stmt, update_data) \ No newline at end of file diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index dffe06535..221ee086d 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -30,6 +30,8 @@ from augur.application.db.models.base import Base from augur.application.db.util import execute_session_query +from augur.application.db import get_session + DEFAULT_REPO_GROUP_ID = 1 metadata = Base.metadata @@ -588,12 +590,13 @@ def is_valid_repo_group_id(session, repo_group_id: int) -> bool: @staticmethod def get_by_name(session, rg_name): - query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + with get_session() as session: - try: - result = execute_session_query(query, 'one') - except NoResultFound: - return None + try: + query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + result = execute_session_query(query, 'one') + except NoResultFound: + return None return result diff --git 
a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 029444215..f702f05f2 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1233,7 +1233,7 @@ class CollectionStatus(Base): repo = relationship("Repo", back_populates="collection_status") @staticmethod - def insert(session, repo_id): + def insert(session, logger, repo_id): from augur.tasks.github.util.util import get_repo_weight_by_issue from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -1246,13 +1246,13 @@ def insert(session, repo_id): if "github" in repo_git: try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + pr_issue_count = get_repo_weight_by_issue(logger, repo_git) #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) else: try: @@ -1261,7 +1261,7 @@ def insert(session, repo_id): except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) @@ -1276,7 +1276,7 @@ def insert(session, repo_id): result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) - session.logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") + logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") if not result: return False diff --git a/augur/application/logs.py b/augur/application/logs.py index 11e1cb6ea..0d6649ce4 100644 --- a/augur/application/logs.py +++ b/augur/application/logs.py @@ -36,12 +36,29 @@ def getFormatter(logLevel): return logging.Formatter(fmt=ERROR_FORMAT_STRING) # create a file handler and set the format and log level -def create_file_handler(file, formatter, level): - handler = FileHandler(filename=file, mode='a') - handler.setFormatter(fmt=formatter) - handler.setLevel(level) +# def create_file_handler(file, formatter, level): +# handler = FileHandler(filename=file, mode='a') +# handler.setFormatter(fmt=formatter) +# handler.setLevel(level) + +# return handler - return handler +def create_file_handler(file, formatter, level): + try: + # Ensure the directory exists + directory = os.path.dirname(file) + if not os.path.exists(directory): + os.makedirs(directory) + + # Create the file handler + handler = logging.FileHandler(filename=file, mode='a') + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + except Exception as e: + print(f"Failed to create file handler: {e}") + return None # function to create two file handlers and add them to a logger def initialize_file_handlers(logger, file, log_level): diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index e59951ab0..d548ecf10 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -20,10 +20,8 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, 
RepoClusterMessage, RepoTopic, TopicWord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import RepoClusterMessage, RepoTopic, TopicWord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -37,10 +35,9 @@ def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - clustering_model(repo_git, logger, engine, session) + clustering_model(repo_git, logger, engine) -def clustering_model(repo_git: str,logger,engine, session) -> None: +def clustering_model(repo_git: str,logger,engine) -> None: logger.info(f"Starting clustering analysis for {repo_git}") @@ -56,8 +53,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: tool_version = '0.2.0' data_source = 'Augur Collected Messages' - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id num_clusters = get_value("Clustering_Task", 'num_clusters') max_df = get_value("Clustering_Task", 'max_df') @@ -123,7 +119,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: # check if dumped pickle file exists, if exists no need to train the model if not os.path.exists(MODEL_FILE_NAME): logger.info("clustering model not trained. Training the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: model_stats = os.stat(MODEL_FILE_NAME) model_age = (time.time() - model_stats.st_mtime) @@ -131,7 +127,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug(f'model age is: {model_age}') if model_age > 2000000: logger.info("clustering model to old. 
Retraining the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: logger.info("using pre-trained clustering model....") @@ -162,18 +158,20 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: prediction = kmeans_model.predict(feature_matrix_cur_repo) logger.info("prediction: " + str(prediction[0])) - # inserting data - record = { - 'repo_id': int(repo_id), - 'cluster_content': int(prediction[0]), - 'cluster_mechanism': -1, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - repo_cluster_messages_obj = RepoClusterMessage(**record) - session.add(repo_cluster_messages_obj) - session.commit() + with get_session() as session: + + # inserting data + record = { + 'repo_id': int(repo_id), + 'cluster_content': int(prediction[0]), + 'cluster_mechanism': -1, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + repo_cluster_messages_obj = RepoClusterMessage(**record) + session.add(repo_cluster_messages_obj) + session.commit() # result = db.execute(repo_cluster_messages_table.insert().values(record)) logging.info( @@ -197,22 +195,24 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug('prediction vocab') prediction = lda_model.transform(count_matrix_cur_repo) - logger.debug('for loop for vocab') - for i, prob_vector in enumerate(prediction): - # repo_id = msg_df.loc[i]['repo_id'] - for i, prob in enumerate(prob_vector): - record = { - 'repo_id': int(repo_id), - 'topic_id': i + 1, - 'topic_prob': prob, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - repo_topic_object = RepoTopic(**record) - session.add(repo_topic_object) - session.commit() + with get_session() as session: + + logger.debug('for loop for vocab') + for i, prob_vector in enumerate(prediction): + # repo_id = msg_df.loc[i]['repo_id'] + for i, prob in enumerate(prob_vector): + record = { + 'repo_id': int(repo_id), + 'topic_id': i + 1, + 'topic_prob': prob, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + repo_topic_object = RepoTopic(**record) + session.add(repo_topic_object) + session.commit() # result = db.execute(repo_topic_table.insert().values(record)) except Exception as e: @@ -260,7 +260,7 @@ def preprocess_and_tokenize(text): stems = [stemmer.stem(t) for t in tokens] return stems -def train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): +def train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): def visualize_labels_PCA(features, labels, annotations, num_components, title): labels_color_map = {-1: "red"} for label in labels: @@ -372,32 +372,35 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # twid = self.db.execute(key_sequence_words_sql) # logger.info("twid variable is: {}".format(twid)) # insert topic list into database - topic_id = 1 - for topic in topic_list: - # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 - # logger.info("twid variable 
is: {}".format(twid)) - for i in topic.argsort()[:-num_words_per_topic - 1:-1]: - # twid+=1 - # logger.info("in loop incremented twid variable is: {}".format(twid)) + + with get_session() as session: + + topic_id = 1 + for topic in topic_list: + # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 # logger.info("twid variable is: {}".format(twid)) - record = { - # 'topic_words_id': twid, - # 'word_prob': word_prob[i], - 'topic_id': int(topic_id), - 'word': feature_names[i], - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - topic_word_obj = TopicWord(**record) - session.add(topic_word_obj) - session.commit() - - # result = db.execute(topic_words_table.insert().values(record)) - logger.info( - "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) - topic_id += 1 + for i in topic.argsort()[:-num_words_per_topic - 1:-1]: + # twid+=1 + # logger.info("in loop incremented twid variable is: {}".format(twid)) + # logger.info("twid variable is: {}".format(twid)) + record = { + # 'topic_words_id': twid, + # 'word_prob': word_prob[i], + 'topic_id': int(topic_id), + 'word': feature_names[i], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + topic_word_obj = TopicWord(**record) + session.add(topic_word_obj) + session.commit() + + # result = db.execute(topic_words_table.insert().values(record)) + logger.info( + "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) + topic_id += 1 # insert topic list into database diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 15660e763..232614ad1 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -4,9 +4,10 @@ from datetime import datetime from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import ContributorRepo +from augur.application.db.lib import bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth ### This worker scans all the platform users in Augur, and pulls their platform activity ### logs. 
Those are then used to analyze what repos each is working in (which will include repos not @@ -25,6 +26,8 @@ def contributor_breadth_model(self) -> None: tool_version = '0.0.1' data_source = 'GitHub API' + key_auth = GithubRandomKeyAuth(logger) + # This version of the query pulls contributors who have not had any data collected yet # To the top of the list cntrb_login_query = s.sql.text(""" @@ -60,7 +63,6 @@ def contributor_breadth_model(self) -> None: current_cntrb_logins = [dict(row) for row in result.mappings()] - cntrb_newest_events_query = s.sql.text(""" SELECT c.gh_login, MAX(cr.created_at) as newest_event_date FROM contributor_repo AS cr @@ -81,43 +83,40 @@ def contributor_breadth_model(self) -> None: cntrb_newest_events_map[gh_login] = newest_event_date + index = 1 + total = len(current_cntrb_logins) + for cntrb in current_cntrb_logins: - with GithubTaskManifest(logger) as manifest: - - index = 1 - total = len(current_cntrb_logins) - for cntrb in current_cntrb_logins: - - print(f"Processing cntrb {index} of {total}") - index += 1 + print(f"Processing cntrb {index} of {total}") + index += 1 - repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" + repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" - newest_event_in_db = datetime(1970, 1, 1) - if cntrb["gh_login"] in cntrb_newest_events_map: - newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] - + newest_event_in_db = datetime(1970, 1, 1) + if cntrb["gh_login"] in cntrb_newest_events_map: + newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] + - cntrb_events = [] - for page_data, page in GithubPaginator(repo_cntrb_url, manifest.key_auth, logger).iter_pages(): + cntrb_events = [] + for page_data, page in GithubPaginator(repo_cntrb_url, key_auth, logger).iter_pages(): - if page_data: - cntrb_events += page_data + if page_data: + cntrb_events += page_data - oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ") - if oldest_event_on_page < newest_event_in_db: - print("Found cntrb events we already have...skipping the rest") - break + oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ") + if oldest_event_on_page < newest_event_in_db: + print("Found cntrb events we already have...skipping the rest") + break - if len(cntrb_events) == 0: - logger.info("There are no cntrb events, or new events for this user.\n") - continue + if len(cntrb_events) == 0: + logger.info("There are no cntrb events, or new events for this user.\n") + continue - events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) + events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) - logger.info(f"Inserting {len(events)} events") - natural_keys = ["event_id", "tool_version"] - manifest.augur_db.insert_data(events, ContributorRepo, natural_keys) + logger.info(f"Inserting {len(events)} events") + natural_keys = ["event_id", "tool_version"] + bulk_insert_dicts(logger, events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 450ec15a2..e78e030e6 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -8,7 +8,7 @@ from collections import Counter from 
augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session, get_repo_by_repo_git from augur.application.db.models import Repo, DiscourseInsight from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -47,10 +47,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: tool_version = '0.1.0' data_source = 'Analysis of Issue/PR Messages' - with DatabaseSession(logger, engine) as session: - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id @@ -96,7 +93,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: logger.debug(f"y_pred_git_flat len: {len(y_pred_git_flat)}") msg_df_cur_repo['discourse_act'] = y_pred_git_flat - with DatabaseSession(logger, engine) as session: + with get_session() as session: for index, row in msg_df_cur_repo.iterrows(): record = { 'msg_id': row['msg_id'], diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 5bf159d2f..97a6580d6 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -10,10 +10,8 @@ import warnings from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, ChaossMetricStatus, RepoInsight, RepoInsightsRecord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import ChaossMetricStatus, RepoInsight, RepoInsightsRecord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask warnings.filterwarnings('ignore') @@ -25,11 +23,10 @@ def insight_task(self, repo_git): logger = logging.getLogger(insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - insight_model(repo_git, logger, engine, session) + insight_model(repo_git, logger, engine) -def insight_model(repo_git: str,logger,engine,session) -> None: +def insight_model(repo_git: str,logger,engine) -> None: refresh = True send_insights = True @@ -40,8 +37,8 @@ def insight_model(repo_git: str,logger,engine,session) -> None: metrics = {"issues-new": "issues", "code-changes": "commit_count", "code-changes-lines": "added", "reviews": "pull_requests", "contributors-new": "new_contributors"} - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id anomaly_days = get_value('Insight_Task', 'anomaly_days') training_days = get_value('Insight_Task', 'training_days') @@ -247,7 +244,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_record_obj = RepoInsightsRecord(**record) session.add(repo_insight_record_obj) session.commit() @@ -292,7 +289,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with 
DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_obj = RepoInsight(**data_point) session.add(repo_insight_obj) session.commit() diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 6cc0446ab..fe12bb960 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -12,10 +12,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, MessageAnalysis, MessageAnalysisSummary -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import MessageAnalysis, MessageAnalysisSummary from augur.tasks.init.celery_app import AugurMlRepoCollectionTask #SPDX-License-Identifier: MIT @@ -28,12 +26,11 @@ def message_insight_task(self, repo_git): logger = logging.getLogger(message_insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - message_insight_model(repo_git, logger, engine, session) + message_insight_model(repo_git, logger, engine) -def message_insight_model(repo_git: str,logger,engine, session) -> None: +def message_insight_model(repo_git: str,logger,engine) -> None: full_train = True begin_date = '' @@ -45,8 +42,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: now = datetime.datetime.utcnow() run_id = int(now.timestamp())+5 - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) insight_days = get_value("Message_Insights", 'insight_days') @@ -193,32 +190,34 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: logger.info('Begin message_analysis data insertion...') logger.info(f'{df_message.shape[0]} data records to be inserted') - for row in df_message.itertuples(index=False): - try: - msg = { - "msg_id": row.msg_id, - "worker_run_id": run_id, - "sentiment_score": row.sentiment_score, - "reconstruction_error": row.rec_err, - "novelty_flag": row.novel_label, - "feedback_flag": None, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source, - } - - message_analysis_object = MessageAnalysis(**msg) - session.add(message_analysis_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') - # logger.info( - # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') - except Exception as e: - logger.error(f'Error occurred while storing datapoint {repr(e)}') - break + with get_session() as session: + + for row in df_message.itertuples(index=False): + try: + msg = { + "msg_id": row.msg_id, + "worker_run_id": run_id, + "sentiment_score": row.sentiment_score, + "reconstruction_error": row.rec_err, + "novelty_flag": row.novel_label, + "feedback_flag": None, + 
"tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + } + + message_analysis_object = MessageAnalysis(**msg) + session.add(message_analysis_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') + # logger.info( + # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') + except Exception as e: + logger.error(f'Error occurred while storing datapoint {repr(e)}') + break logger.info('Data insertion completed\n') @@ -318,27 +317,30 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: # Insertion of sentiment ratios & novel counts to repo level table logger.info('Begin repo wise insights insertion...') logger.info(f'{df_senti.shape[0]} data records to be inserted\n') - for row in df_trend.itertuples(): - msg = { - "repo_id": repo_id, - "worker_run_id": run_id, - "positive_ratio": row.PosR, - "negative_ratio": row.NegR, - "novel_count": row.Novel, - "period": row.Index, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source - } - - message_analysis_summary_object = MessageAnalysisSummary(**msg) - session.add(message_analysis_summary_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') - # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') + + with get_session() as session: + + for row in df_trend.itertuples(): + msg = { + "repo_id": repo_id, + "worker_run_id": run_id, + "positive_ratio": row.PosR, + "negative_ratio": row.NegR, + "novel_count": row.Novel, + "period": row.Index, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + message_analysis_summary_object = MessageAnalysisSummary(**msg) + session.add(message_analysis_summary_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') + # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') logger.info('Data insertion completed\n') diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index af806bcdd..2347eb109 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -9,10 +9,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, PullRequestAnalysis -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import PullRequestAnalysis from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -40,14 +38,11 @@ def pull_request_analysis_model(repo_git: 
str,logger,engine) -> None: insight_days = 200 - with DatabaseSession(logger, engine) as session: + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) - senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) - - logger.info(f'Sentiment model dir located - {senti_models_dir}') + logger.info(f'Sentiment model dir located - {senti_models_dir}') # Any initial database instructions, like finding the last tuple inserted or generate the next ID value @@ -211,7 +206,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: logger.info('Begin PR_analysis data insertion...') logger.info(f'{df.shape[0]} data records to be inserted') - with DatabaseSession(logger, engine) as session: + with get_session() as session: for row in df.itertuples(index=False): try: msg = { diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index c191b5603..09faffe0c 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -3,7 +3,7 @@ import sqlalchemy as s from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql @celery.task(bind=True) def refresh_materialized_views(self): @@ -86,92 +86,79 @@ def refresh_materialized_views(self): """) try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv1_refresh) + execute_sql(mv1_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv2_refresh) + execute_sql(mv2_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv3_refresh) + execute_sql(mv3_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv4_refresh) + execute_sql(mv4_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv5_refresh) + execute_sql(mv5_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv6_refresh) + execute_sql(mv6_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv7_refresh) + execute_sql(mv7_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv8_refresh) + execute_sql(mv8_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv9_refresh) + execute_sql(mv9_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv10_refresh) + execute_sql(mv10_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - 
session.execute_sql(mv11_refresh) + execute_sql(mv11_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv12_refresh) + execute_sql(mv12_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv13_refresh) + execute_sql(mv13_refresh) except Exception as e: logger.info(f"error is {e}") pass diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 9e48757d6..b892570ad 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -1,43 +1,39 @@ from datetime import datetime from augur.application.db.models import * -from augur.application.db.lib import get_value -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, bulk_insert_dicts, get_repo_by_repo_git from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def deps_libyear_model( session, repo_id,repo_git,repo_group_id): +def deps_libyear_model(logger,repo_git): """ Data collection and storage method """ - session.logger.info(f"This is the libyear deps model repo: {repo_git}") + logger.info(f"This is the libyear deps model repo: {repo_git}") #result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups() #relative_repo_path = f"{repo_group_id}/{result[0]}{result[1]}" - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - - result = execute_session_query(query, 'one') + + repo = get_repo_by_repo_git(repo_git) - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,result.repo_path,result.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path - generate_deps_libyear_data(session,repo_id, absolute_repo_path) + generate_deps_libyear_data(logger, repo.repo_id, absolute_repo_path) -def generate_deps_libyear_data(session, repo_id, path): +def generate_deps_libyear_data(logger, repo_id, path): """Scans for package files and calculates libyear - :param session: Task manifest and database session. 
:param repo_id: Repository ID :param path: Absolute path of the Repostiory """ date_scanned = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}') - deps = get_deps_libyear_data(path,session.logger) + deps = get_deps_libyear_data(path,logger) if not deps: - session.logger.info(f"No deps found for repo {repo_id} on path {path}") + logger.info(f"No deps found for repo {repo_id} on path {path}") return to_insert = [] @@ -66,6 +62,6 @@ def generate_deps_libyear_data(session, repo_id, path): # VALUES (:repo_id, :name,:requirement,:type,:package_manager,:current_verion,:latest_version,:current_release_date,:latest_release_date,:libyear,:tool_source,:tool_version,:data_source, :data_collection_date) #""").bindparams(**repo_deps) # - #session.execute_sql(insert_statement) to_insert.append(repo_deps) - session.insert_data(to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) + + bulk_insert_dicts(logger, to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py index bcfe810a9..6ac9d4d40 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py @@ -1,4 +1,8 @@ import requests +import logging +import traceback + +logger = logging.getLogger(__name__) def get_NPM_data(package): url = "https://registry.npmjs.org/%s" % package @@ -42,10 +46,17 @@ def get_latest_patch(version, data): def get_lastest_minor(version, data): - versions = data['versions'] + try: + versions = data['versions'] + except Exception as e: + logger.info( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + # raise e + try: index = list(versions.keys()).index(version) except ValueError as e: + logger.info(f'error is {e} on the NPM. Some kind of value error. 
Probably a VALUES error for Node, #AmIRight?') raise e major,minor,patch = split_version(version) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index 7aaaf1f19..dab06b1a0 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -160,7 +160,14 @@ def parse_conda(file_handle): pip = None if not contents: return [] - dependencies = contents['dependencies'] + #dependencies = contents['dependencies'] + dependencies = contents.get('dependencies', []) + + if not dependencies: + print("No dependencies found.") + return [] + else: + print("Dependencies found.") for dep in dependencies: if (type(dep) is dict) and dep['pip']: pip = dep diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index ff15c61d9..fbf121b2a 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -1,22 +1,12 @@ import logging -from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_libyear_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) def process_libyear_dependency_metrics(self, repo_git): #raise NotImplementedError - engine = self.app.engine - logger = logging.getLogger(process_libyear_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - - repo = execute_session_query(query,'one') - deps_libyear_model(session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file + deps_libyear_model(logger, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index e4c627347..979020f9c 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -1,23 +1,34 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -def generate_deps_data(session, repo_id, path): + +def generate_deps_data(logger, repo_git): """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ - - scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') + logger.info(f"repo_git: {repo_git}") + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + path = get_absolute_repo_path(get_value("Facade", 
"repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - deps = dep_calc.get_deps(path,session.logger) + logger.debug(f"This is the deps model repo: {repo_git}.") + + scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') + + deps = dep_calc.get_deps(path,logger) to_insert = [] for dep in deps: @@ -33,56 +44,67 @@ def generate_deps_data(session, repo_id, path): } to_insert.append(repo_deps) - - session.insert_data(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) - session.logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") + bulk_insert_dicts(logger, to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) + + logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") """ def deps_model(session, repo_id,repo_git,repo_path,repo_name): # Data collection and storage method - session.logger.info(f"This is the deps model repo: {repo_git}.") + logger.info(f"This is the deps model repo: {repo_git}.") generate_deps_data(session,repo_id, absolute_repo_path) """ -def generate_scorecard(session,repo_id,path): +def generate_scorecard(logger, repo_git): """Runs scorecard on repo and stores data in database :param repo_id: Repository ID - :param path: URL path of the Repostiory - """ - session.logger.info('Generating scorecard data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") + :param repo_git: URL path of the Repository + """ + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + logger.info('Generating scorecard data for repo') # we convert relative path in the format required by scorecard like github.com/chaoss/augur # raw_path,_ = path.split('-') # scorecard_repo_path = raw_path[2:] - path = path[8:] + path = repo_git[8:] if path[-4:] == '.git': path = path.replace(".git", "") - command = '--repo='+ path + command = '--repo=' + path #this is path where our scorecard project is located path_to_scorecard = os.environ['HOME'] + '/scorecard' #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session, session.logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + + with get_session() as session: + #key_handler = GithubRandomKeyAuth(logger) + key_handler = GithubApiKeyHandler(logger) + os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + + # This seems outdated + #setting the environmental variable which is required by scorecard + #key_handler = GithubApiKeyHandler(session, session.logger) + #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() try: - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) except Exception as e: session.logger.error(f"Could not parse required output! 
Error: {e}") raise e + + # end - session.logger.info('adding to database...') - session.logger.debug(f"output: {required_output}") + logger.info('adding to database...') + logger.debug(f"output: {required_output}") if not required_output['checks']: - session.logger.info('No scorecard checks found!') + logger.info('No scorecard checks found!') return #Store the overall score first @@ -98,7 +120,7 @@ def generate_scorecard(session,repo_id,path): 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } to_insert.append(overall_deps_scorecard) - # session.insert_data(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) + # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) #Store misc data from scorecard in json field. for check in required_output['checks']: @@ -114,8 +136,8 @@ def generate_scorecard(session,repo_id,path): } to_insert.append(repo_deps_scorecard) - session.insert_data(to_insert, RepoDepsScorecard, ["repo_id","name"]) + bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) - session.logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 152c05308..ddfe11ff4 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -1,34 +1,16 @@ import logging import traceback -from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -from augur.application.db.lib import get_value -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_dependency_metrics(self, repo_git): - #raise NotImplementedError - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_dependency_metrics(repo_git): logger = logging.getLogger(process_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - - repo = execute_session_query(query,'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - session.logger.debug(f"This is the deps model repo: {repo_git}.") - - generate_deps_data(session,repo.repo_id,absolute_repo_path) + generate_deps_data(logger, repo_git) @celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) @@ -38,10 +20,4 @@ def process_ossf_dependency_metrics(self, repo_git): logger = logging.getLogger(process_ossf_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - repo = execute_session_query(query,'one') - generate_scorecard(session, repo.repo_id, repo_git) \ No newline at end of file + generate_scorecard(logger, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 9c699f7e7..97a69a757 100644 --- a/augur/tasks/git/facade_tasks.py +++ 
b/augur/tasks/git/facade_tasks.py @@ -4,16 +4,17 @@ from celery import group, chain import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches -from augur.tasks.git.util.facade_worker.facade_worker.postanalysiscleanup import git_repo_cleanup from augur.tasks.github.facade_github.tasks import * +from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates @@ -58,13 +59,10 @@ def facade_error_handler(request,exc,traceback): def facade_analysis_init_facade_task(repo_git): logger = logging.getLogger(facade_analysis_init_facade_task.__name__) - with FacadeSession(logger) as session: + facade_helper = FacadeHelper(logger) - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - - session.update_status('Running analysis') - session.log_activity('Info',f"Beginning analysis.") + facade_helper.update_status('Running analysis') + facade_helper.log_activity('Info',f"Beginning analysis.") @celery.task(base=AugurFacadeRepoCollectionTask) @@ -72,107 +70,76 @@ def trim_commits_facade_task(repo_git): logger = logging.getLogger(trim_commits_facade_task.__name__) - with FacadeSession(logger) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - - def update_analysis_log(repos_id,status): - - # Log a repo's analysis status + facade_helper = FacadeHelper(logger) - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - try: - session.execute_sql(log_message) - except: - pass + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - session.inc_repos_processed() - update_analysis_log(repo_id,"Beginning analysis.") - # First we check to see if the previous analysis didn't complete + facade_helper.inc_repos_processed() + facade_helper.update_analysis_log(repo_id,"Beginning analysis.") + # First we check to see if the previous analysis didn't complete - get_status = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=repo_id) + working_commits = get_working_commits_by_repo_id(repo_id) - try: - working_commits = 
session.fetchall_data_from_sql_text(get_status) - except: - working_commits = [] - - # If there's a commit still there, the previous run was interrupted and - # the commit data may be incomplete. It should be trimmed, just in case. - commits_to_trim = [commit['working_commit'] for commit in working_commits] - - trim_commits(session,repo_id,commits_to_trim) - # Start the main analysis + # If there's a commit still there, the previous run was interrupted and + # the commit data may be incomplete. It should be trimmed, just in case. + commits_to_trim = [commit['working_commit'] for commit in working_commits] + + trim_commits(facade_helper,repo_id,commits_to_trim) + # Start the main analysis - update_analysis_log(repo_id,'Collecting data') - logger.info(f"Got past repo {repo_id}") + facade_helper.update_analysis_log(repo_id,'Collecting data') + logger.info(f"Got past repo {repo_id}") @celery.task(base=AugurFacadeRepoCollectionTask) def trim_commits_post_analysis_facade_task(repo_git): logger = logging.getLogger(trim_commits_post_analysis_facade_task.__name__) + facade_helper = FacadeHelper(logger) - with FacadeSession(logger) as session: - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - - start_date = session.get_setting('start_date') - def update_analysis_log(repos_id,status): - - # Log a repo's analysis status - - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - - session.execute_sql(log_message) - - session.logger.info(f"Generating sequence for repo {repo_id}") - - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') + repo = repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD - - parent_commits = get_parent_commits_set(repo_loc, start_date) + start_date = facade_helper.get_setting('start_date') + + logger.info(f"Generating sequence for repo {repo_id}") - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) + repo = get_repo_by_repo_git(repo_git) - # Find missing commits and add them + #Get the huge list of commits to process. 
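Editor's note: in the hunk just below, `trim_commits_post_analysis_facade_task` reconciles git history with the database using two set differences: commits reachable from HEAD but not yet stored are "missing" and get analyzed, while stored commits no longer reachable in the analysis window get trimmed. A tiny standalone sketch of that reconciliation (the hashes are made up for illustration):

```python
# Hypothetical hashes, used only to illustrate the set arithmetic below.
parent_commits = {"a1b2c3", "d4e5f6", "0f9e8d"}      # reachable from HEAD since start_date
existing_commits = {"d4e5f6", "0f9e8d", "deadbe"}    # already stored for this repo_id

# Commits git knows about but the database does not: queue these for analysis.
missing_commits = parent_commits - existing_commits   # {"a1b2c3"}

# Commits the database has but git no longer reports in range: trim these.
trimmed_commits = existing_commits - parent_commits   # {"deadbe"}

print(sorted(missing_commits), sorted(trimmed_commits))
```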
+ absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + # Grab the parents of HEAD - missing_commits = parent_commits - existing_commits + parent_commits = get_parent_commits_set(repo_loc, start_date) - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - - # Find commits which are out of the analysis range + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - trimmed_commits = existing_commits - parent_commits + # Find missing commits and add them - update_analysis_log(repo_id,'Data collection complete') + missing_commits = parent_commits - existing_commits - update_analysis_log(repo_id,'Beginning to trim commits') + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + + # Find commits which are out of the analysis range - session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") + trimmed_commits = existing_commits - parent_commits + facade_helper.update_analysis_log(repo_id,'Data collection complete') + facade_helper.update_analysis_log(repo_id,'Beginning to trim commits') - #for commit in trimmed_commits: - trim_commits(session,repo_id,trimmed_commits) - + facade_helper.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") - update_analysis_log(repo_id,'Commit trimming complete') + #for commit in trimmed_commits: + trim_commits(facade_helper,repo_id,trimmed_commits) + + facade_helper.update_analysis_log(repo_id,'Commit trimming complete') - update_analysis_log(repo_id,'Complete') + facade_helper.update_analysis_log(repo_id,'Complete') @@ -180,8 +147,8 @@ def update_analysis_log(repos_id,status): def facade_analysis_end_facade_task(): logger = logging.getLogger(facade_analysis_end_facade_task.__name__) - with FacadeSession(logger) as session: - session.log_activity('Info','Running analysis (complete)') + facade_helper = FacadeHelper(logger) + facade_helper.log_activity('Info','Running analysis (complete)') @@ -189,9 +156,9 @@ def facade_analysis_end_facade_task(): def facade_start_contrib_analysis_task(): logger = logging.getLogger(facade_start_contrib_analysis_task.__name__) - with FacadeSession(logger) as session: - session.update_status('Updating Contributors') - session.log_activity('Info', 'Updating Contributors with commits') + facade_helper = FacadeHelper(logger) + facade_helper.update_status('Updating Contributors') + facade_helper.log_activity('Info', 'Updating Contributors with commits') #enable celery multithreading @@ -202,78 +169,69 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: #create new session for celery thread. logger = logging.getLogger(analyze_commits_in_parallel.__name__) - with FacadeSession(logger) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + facade_helper = FacadeHelper(logger) - start_date = session.get_setting('start_date') + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - session.logger.info(f"Generating sequence for repo {repo_id}") - - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') + start_date = facade_helper.get_setting('start_date') - #Get the huge list of commits to process. 
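Editor's note: the analysis loop in `analyze_commits_in_parallel`, continued in the next hunk, buffers the record lists returned by `analyze_commit` and hands them to `facade_bulk_insert_commits` once 1,000 are pending, with a final flush after the loop. A minimal sketch of that batching pattern, with a logging stub standing in for the real bulk insert:

```python
import logging

logger = logging.getLogger(__name__)

BATCH_SIZE = 1000  # the diff flushes once 1,000 commit records are pending


def flush(records):
    # Stand-in for facade_bulk_insert_commits(logger, records); just logs here.
    logger.info("bulk inserting %d commit records", len(records))


def analyze_in_batches(queue, analyze_one):
    """Buffer per-commit record lists and flush them in fixed-size batches."""
    pending = []
    for commit in queue:
        pending.extend(analyze_one(commit))   # analyze_commit returns a list of records
        if len(pending) >= BATCH_SIZE:
            flush(pending)
            pending = []
    if pending:                               # final partial batch
        flush(pending)
```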
- absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD - - parent_commits = get_parent_commits_set(repo_loc, start_date) + logger.info(f"Generating sequence for repo {repo_id}") + + repo = get_repo_by_repo_id(repo_id) - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) + #Get the huge list of commits to process. + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + # Grab the parents of HEAD - # Find missing commits and add them - missing_commits = parent_commits - existing_commits + parent_commits = get_parent_commits_set(repo_loc, start_date) - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - - if not len(missing_commits) or repo_id is None: - #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) - return - - queue = list(missing_commits) + # Find missing commits and add them + missing_commits = parent_commits - existing_commits - logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - pendingCommitRecordsToInsert = [] + + if not len(missing_commits) or repo_id is None: + #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) + return + + queue = list(missing_commits) - for count, commitTuple in enumerate(queue): - quarterQueue = int(len(queue) / 4) + logger.info(f"Got to analysis!") + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absolute_path}/.git") - if quarterQueue == 0: - quarterQueue = 1 # prevent division by zero with integer math + pendingCommitRecordsToInsert = [] - #Log progress when another quarter of the queue has been processed - if (count + 1) % quarterQueue == 0: - logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") + for count, commitTuple in enumerate(queue): + quarterQueue = int(len(queue) / 4) + if quarterQueue == 0: + quarterQueue = 1 # prevent division by zero with integer math - #logger.info(f"Got to analysis!") - commitRecords = analyze_commit(session, repo_id, repo_loc, commitTuple) - #logger.debug(commitRecord) - if len(commitRecords): - pendingCommitRecordsToInsert.extend(commitRecords) - if len(pendingCommitRecordsToInsert) >= 1000: - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) - pendingCommitRecordsToInsert = [] + #Log progress when another quarter of the queue has been processed + if (count + 1) % quarterQueue == 0: + logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") - - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) - - + #logger.info(f"Got to analysis!") + commitRecords = analyze_commit(logger, repo_id, repo_loc, commitTuple) + #logger.debug(commitRecord) + if len(commitRecords): + pendingCommitRecordsToInsert.extend(commitRecords) + if len(pendingCommitRecordsToInsert) >= 1000: + 
facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) + pendingCommitRecordsToInsert = [] + + facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) + # Remove the working commit. + remove_working_commits_by_repo_id_and_hashes(repo_id, queue) - # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit IN :hashes - """).bindparams(repo_id=repo_id,hashes=tuple(queue)) - session.execute_sql(remove_commit) - logger.info("Analysis complete") return @@ -282,40 +240,31 @@ def nuke_affiliations_facade_task(): logger = logging.getLogger(nuke_affiliations_facade_task.__name__) - with FacadeSession(logger) as session: - nuke_affiliations(session) + facade_helper = FacadeHelper(logger) + nuke_affiliations(facade_helper) @celery.task def fill_empty_affiliations_facade_task(): logger = logging.getLogger(fill_empty_affiliations_facade_task.__name__) - with FacadeSession(logger) as session: - fill_empty_affiliations(session) + facade_helper = FacadeHelper(logger) + fill_empty_affiliations(facade_helper) @celery.task def invalidate_caches_facade_task(): logger = logging.getLogger(invalidate_caches_facade_task.__name__) - with FacadeSession(logger) as session: - invalidate_caches(session) + facade_helper = FacadeHelper(logger) + invalidate_caches(facade_helper) @celery.task def rebuild_unknown_affiliation_and_web_caches_facade_task(): logger = logging.getLogger(rebuild_unknown_affiliation_and_web_caches_facade_task.__name__) - with FacadeSession(logger) as session: - rebuild_unknown_affiliation_and_web_caches(session) - - -@celery.task -def git_repo_cleanup_facade_task(repo_git): - - logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) - - with FacadeSession(logger) as session: - git_repo_cleanup(session, repo_git) + facade_helper = FacadeHelper(logger) + rebuild_unknown_affiliation_and_web_caches(facade_helper) # retry this task indefinitely every 5 minutes if it errors. 
Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) @@ -325,27 +274,29 @@ def clone_repos(): is_pending = CollectionStatus.facade_status == CollectionState.PENDING.value - with FacadeSession(logger) as session: + facade_helper = FacadeHelper(logger) + + with get_session() as session: # process up to 1000 repos at a time repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) for repo_git in repo_git_identifiers: # set repo to intializing - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() + repo = Repo.get_by_repo_git(session, repo_git) repoStatus = repo.collection_status[0] setattr(repoStatus,"facade_status", CollectionState.INITIALIZING.value) session.commit() # clone repo try: - git_repo_initialize(session, repo_git) + git_repo_initialize(facade_helper, session, repo_git) session.commit() # get the commit count - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) # set repo to update setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) @@ -361,9 +312,7 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() - clone_repos.si().apply_async(countdown=60*5) - - + clone_repos.si().apply_async(countdown=60*5) #@celery.task(bind=True) @@ -373,7 +322,7 @@ def clone_repos(): # # logger = logging.getLogger(check_for_repo_updates_facade_task.__name__) # -# with FacadeSession(logger) as session: +# facade_helper = FacadeHelper(logger) # check_for_repo_updates(session, repo_git) @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) @@ -383,11 +332,12 @@ def git_update_commit_count_weight(self, repo_git): logger = logging.getLogger(git_update_commit_count_weight.__name__) # Change facade session to take in engine - with FacadeSession(logger) as session: - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + facade_helper = FacadeHelper(logger) - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) + + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) @celery.task(base=AugurFacadeRepoCollectionTask) @@ -395,11 +345,12 @@ def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) - with FacadeSession(logger) as session: - git_repo_updates(session, repo_git) + facade_helper = FacadeHelper(logger) + + git_repo_updates(facade_helper, repo_git) -def generate_analysis_sequence(logger,repo_git, session): +def generate_analysis_sequence(logger,repo_git, facade_helper): """Run the analysis by looping over all active repos. For each repo, we retrieve the list of commits which lead to HEAD. If any are missing from the database, they are filled in. 
Then we check to see if any commits in the database are @@ -410,19 +361,16 @@ def generate_analysis_sequence(logger,repo_git, session): commit being analyzed at the time) we can recover. """ - - analysis_sequence = [] - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) analysis_sequence.append(facade_analysis_init_facade_task.si(repo_git)) @@ -439,110 +387,132 @@ def generate_analysis_sequence(logger,repo_git, session): return analysis_sequence + +def generate_contributor_sequence(logger,repo_git, session): + + contributor_sequence = [] + #all_repo_ids = [] + repo_id = None + + #contributor_sequence.append(facade_start_contrib_analysis_task.si()) + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + #pdb.set_trace() + #breakpoint() + #for repo in all_repos: + # contributor_sequence.append(insert_facade_contributors.si(repo['repo_id'])) + #all_repo_ids = [repo['repo_id'] for repo in all_repos] + + #contrib_group = create_grouped_task_load(dataList=all_repo_ids,task=insert_facade_contributors)#group(contributor_sequence) + #contrib_group.link_error(facade_error_handler.s()) + #return contrib_group#chain(facade_start_contrib_analysis_task.si(), contrib_group) + return insert_facade_contributors.si(repo_id) + + def facade_phase(repo_git): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - #Get the repo_id - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + facade_helper = FacadeHelper(logger) + #Get the repo_id + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) - #Get the collectionStatus - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) + #Get the collectionStatus + #query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) - status = execute_session_query(query,'one') - - # Figure out what we need to do - limited_run = session.limited_run - run_analysis = session.run_analysis - pull_repos = session.pull_repos - #force_analysis = session.force_analysis - run_facade_contributors = session.run_facade_contributors - - facade_sequence = [] - facade_core_collection = [] - - if not limited_run or (limited_run and pull_repos): - facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) - - facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) + #status = 
execute_session_query(query,'one') + + # Figure out what we need to do + limited_run = facade_helper.limited_run + run_analysis = facade_helper.run_analysis + pull_repos = facade_helper.pull_repos + #force_analysis = session.force_analysis + run_facade_contributors = facade_helper.run_facade_contributors + + facade_sequence = [] + facade_core_collection = [] + + if not limited_run or (limited_run and pull_repos): + facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) + + facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) - #Generate commit analysis task order. - if not limited_run or (limited_run and run_analysis): - facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,session)) + #Generate commit analysis task order. + if not limited_run or (limited_run and run_analysis): + facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,facade_helper)) - #Generate contributor analysis task group. - if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(insert_facade_contributors.si(repo_git)) + #Generate contributor analysis task group. + if not limited_run or (limited_run and run_facade_contributors): + facade_core_collection.append(generate_contributor_sequence(logger,repo_git,facade_helper)) - #These tasks need repos to be cloned by facade before they can work. - facade_sequence.append( - group( - chain(*facade_core_collection), - process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git), - process_scc_value_metrics.si(repo_git) - ) + #These tasks need repos to be cloned by facade before they can work. + facade_sequence.append( + group( + chain(*facade_core_collection), + process_dependency_metrics.si(repo_git), + process_libyear_dependency_metrics.si(repo_git), + process_scc_value_metrics.si(repo_git) ) + ) - logger.info(f"Facade sequence: {facade_sequence}") - return chain(*facade_sequence) + logger.info(f"Facade sequence: {facade_sequence}") + return chain(*facade_sequence) def generate_non_repo_domain_facade_tasks(logger): logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - - # Figure out what we need to do - limited_run = session.limited_run - delete_marked_repos = session.delete_marked_repos - pull_repos = session.pull_repos - # clone_repos = session.clone_repos - check_updates = session.check_updates - # force_updates = session.force_updates - run_analysis = session.run_analysis - # force_analysis = session.force_analysis - nuke_stored_affiliations = session.nuke_stored_affiliations - fix_affiliations = session.fix_affiliations - force_invalidate_caches = session.force_invalidate_caches - rebuild_caches = session.rebuild_caches - #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], - # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( - # 'update_frequency')) else 0 - force_invalidate_caches = session.force_invalidate_caches - create_xlsx_summary_files = session.create_xlsx_summary_files - multithreaded = session.multithreaded - - facade_sequence = [] - - if nuke_stored_affiliations: - #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) - logger.info("Nuke stored affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would be - # nuked upon change no longer exists, and this information can easily 
be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. - - #session.logger.info(session.cfg) - if not limited_run or (limited_run and fix_affiliations): - #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) - logger.info("Fill empty affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would need - # to be fixed upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. - - if force_invalidate_caches: - facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) - - if not limited_run or (limited_run and rebuild_caches): - facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) + facade_helper = FacadeHelper(logger) - return facade_sequence + # Figure out what we need to do + limited_run = facade_helper.limited_run + delete_marked_repos = facade_helper.delete_marked_repos + pull_repos = facade_helper.pull_repos + # clone_repos = facade_helper.clone_repos + check_updates = facade_helper.check_updates + # force_updates = facade_helper.force_updates + run_analysis = facade_helper.run_analysis + # force_analysis = facade_helper.force_analysis + nuke_stored_affiliations = facade_helper.nuke_stored_affiliations + fix_affiliations = facade_helper.fix_affiliations + force_invalidate_caches = facade_helper.force_invalidate_caches + rebuild_caches = facade_helper.rebuild_caches + #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], + # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( + # 'update_frequency')) else 0 + force_invalidate_caches = facade_helper.force_invalidate_caches + create_xlsx_summary_files = facade_helper.create_xlsx_summary_files + multithreaded = facade_helper.multithreaded + + facade_sequence = [] + + if nuke_stored_affiliations: + #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) + logger.info("Nuke stored affiliations is deprecated.") + # deprecated because the UI component of facade where affiliations would be + # nuked upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. + + #logger.info(session.cfg) + if not limited_run or (limited_run and fix_affiliations): + #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) + logger.info("Fill empty affiliations is deprecated.") + # deprecated because the UI component of facade where affiliations would need + # to be fixed upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. 
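Editor's note: for reference, the `facade_phase` rewrite above composes celery signatures so the ordered facade steps run as a chain, while the dependency, libyear, and scc tasks sit beside that chain in a group because they only need the clone on disk. A hedged sketch of that composition using placeholder tasks, not Augur's real ones:

```python
from celery import Celery, chain, group

app = Celery("facade_sketch", broker="memory://")

# Placeholder tasks standing in for the real facade and metric tasks.
@app.task
def update_repo(repo_git): ...

@app.task
def analyze_commits(repo_git): ...

@app.task
def dependency_metrics(repo_git): ...

@app.task
def scc_metrics(repo_git): ...


def build_facade_phase(repo_git):
    core = [update_repo.si(repo_git), analyze_commits.si(repo_git)]
    # The core steps must run in order (chain); the repo-level metric tasks
    # only need the clone, so they sit beside that chain inside a group.
    return chain(group(
        chain(*core),
        dependency_metrics.si(repo_git),
        scc_metrics.si(repo_git),
    ))


sig = build_facade_phase("https://github.com/chaoss/augur")
print(sig)  # inspect the composed signature without dispatching it
```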
+ + if force_invalidate_caches: + facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) + + if not limited_run or (limited_run and rebuild_caches): + facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) + + return facade_sequence diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 71993ebcd..38ad34c56 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -1,24 +1,31 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def value_model(session,repo_git,repo_id, path): +def value_model(logger,repo_git): """Runs scc on repo and stores data in database :param repo_id: Repository ID - :param path: absolute file path of the Repostiory """ + logger.info(f"repo_git: {repo_git}") - session.logger.info('Generating value data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") - session.logger.info('Running scc...') + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,repo.repo_path,repo.repo_name) + + logger.info('Generating value data for repo') + logger.info(f"Repo ID: {repo_id}, Path: {path}") + logger.info('Running scc...') path_to_scc = os.environ['HOME'] + '/scc' - required_output = parse_json_from_subprocess_call(session.logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) + required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) - session.logger.info('adding scc data to database... ') - session.logger.debug(f"output: {required_output}") + logger.info('adding scc data to database... 
') + logger.debug(f"output: {required_output}") to_insert = [] for record in required_output: @@ -42,6 +49,6 @@ def value_model(session,repo_git,repo_id, path): to_insert.append(repo_labor) - session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) + bulk_insert_dicts(logger, to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) - session.logger.info(f"Done generating scc data for repo {repo_id} from path {path}") + logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index 37ff4ac4b..dc0cd9472 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -1,26 +1,13 @@ import logging -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.tasks.git.scc_value_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_value -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_scc_value_metrics(self, repo_git): - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_scc_value_metrics(repo_git): logger = logging.getLogger(process_scc_value_metrics.__name__) - with DatabaseSession(logger,engine) as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - value_model(session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file + value_model(logger,repo_git,) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index a0ca29701..18a436abb 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -29,7 +29,9 @@ import os import sqlalchemy as s -def analyze_commit(session, repo_id, repo_loc, commit): +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text + +def analyze_commit(logger, repo_id, repo_loc, commit): # This function analyzes a given commit, counting the additions, removals, and # whitespace changes. It collects all of the metadata about the commit, and @@ -60,7 +62,7 @@ def check_swapped_emails(name,email): # Sometimes people mix up their name and email in their git settings if name.find('@') >= 0 and email.find('@') == -1: - session.logger.debug(f"Found swapped email/name: {email}/{name}") + logger.debug(f"Found swapped email/name: {email}/{name}") return email,name else: return name,email @@ -71,7 +73,7 @@ def strip_extra_amp(email): # matching. This extra info is not used, so we discard it. 
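Editor's note: `generate_deps_data` and `value_model` above now share one lookup pattern: resolve the `Repo` row from `repo_git` with `get_repo_by_repo_git`, then build the clone's absolute path from the configured Facade `repo_directory`. A minimal sketch of that pattern, assuming the helpers keep the call shapes used in this diff (the wrapper name is illustrative):

```python
import logging

from augur.application.db.lib import get_repo_by_repo_git, get_value
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path

logger = logging.getLogger(__name__)


def resolve_repo_clone_path(repo_git):
    """Return (repo_id, absolute path of the local clone) for a repo URL."""
    repo = get_repo_by_repo_git(repo_git)
    path = get_absolute_repo_path(
        get_value("Facade", "repo_directory"),  # base clone directory from config
        repo.repo_id,
        repo.repo_path,
        repo.repo_name,
    )
    logger.debug("repo %s resolves to %s", repo_git, path)
    return repo.repo_id, path
```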
if email.count('@') > 1: - session.logger.debug(f"Found extra @: {email}") + logger.debug(f"Found extra @: {email}") return email[:email.find('@',email.find('@')+1)] else: return email @@ -84,7 +86,7 @@ def discover_alias(email): WHERE alias_email=:alias_email AND cntrb_active = 1""").bindparams(alias_email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) if canonical: for email in canonical: @@ -111,7 +113,7 @@ def generate_commit_record(repos_id,commit,filename, #2021-10-11 11:57:46 -0500 placeholder_date = "1970-01-01 00:00:15 -0500" - #session.logger.info(f"Timestamp: {author_timestamp}") + #logger.info(f"Timestamp: {author_timestamp}") commit_record = { 'repo_id' : repos_id, 'cmt_commit_hash' : str(commit), @@ -173,7 +175,7 @@ def generate_commit_record(repos_id,commit,filename, #cursor_local.execute(store_working_commit, (repo_id,commit)) #db_local.commit() - session.execute_sql(store_working_commit) + execute_sql(store_working_commit) #session.log_activity('Debug',f"Stored working commit and analyzing : {commit}") diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index 19539d79d..c62034a94 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -34,8 +34,9 @@ from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected -from augur.tasks.github.util.github_task_session import * -from augur.application.config import AugurConfig +from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql +from augur.application.db.lib import get_section from logging import Logger logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def get_database_args_from_env(): #print(credentials) return credentials -class FacadeSession(GithubTaskSession): +class FacadeHelper(): """ORM session used in facade tasks. This class adds the various attributes needed for legacy facade as well as a modified version of the legacy FacadeConfig class. @@ -104,12 +105,12 @@ def __init__(self,logger: Logger): from augur.application.db import get_engine engine = get_engine() - #self.cfg = FacadeConfig(logger) self.repos_processed = 0 - super().__init__(logger=logger, engine=engine) - # Figure out what we need to do + # super().__init__(logger=logger, engine=engine) + + self.logger = logger - worker_options = AugurConfig(logger, self).get_section("Facade") + worker_options = get_section("Facade") self.limited_run = worker_options["limited_run"] self.delete_marked_repos = worker_options["delete_marked_repos"] @@ -150,7 +151,7 @@ def get_setting(self,setting): query = s.sql.text("""SELECT value FROM settings WHERE setting=:settingParam ORDER BY last_modified DESC LIMIT 1""").bindparams(settingParam=setting) - result = self.execute_sql(query).fetchone() + result = execute_sql(query).fetchone() print(result) return result[0] @@ -159,7 +160,7 @@ def update_status(self, status): query = s.sql.text("""UPDATE settings SET value=:statusParam WHERE setting='utility_status' """).bindparams(statusParam=status) - self.execute_sql(query) + execute_sql(query) def log_activity(self, level, status): # Log an activity based upon urgency and user's preference. 
If the log level is @@ -176,7 +177,7 @@ def log_activity(self, level, status): """).bindparams(levelParam=level,statusParam=status) try: - self.execute_sql(query) + execute_sql(query) except Exception as e: self.logger.error(f"Error encountered: {e}") raise e @@ -187,9 +188,19 @@ def update_repo_log(self,repos_id,status): VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) try: - self.execute_sql(log_message) + execute_sql(log_message) except: pass + + def update_analysis_log(self, repos_id,status): + + # Log a repo's analysis status + + log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) + VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) + + execute_sql(log_message) + def insert_or_update_data(self, query, **bind_args)-> None: """Provide deadlock detection for postgres updates, inserts, and deletions for facade. @@ -206,9 +217,9 @@ def insert_or_update_data(self, query, **bind_args)-> None: try: if bind_args: #self.cfg.cursor.execute(query, params) - self.execute_sql(query.bindparams(**bind_args)) + execute_sql(query.bindparams(**bind_args)) else: - self.execute_sql(query) + execute_sql(query) break except OperationalError as e: # print(str(e).split("Process")[1].split(";")[0]) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index b41c6f14d..1811c734f 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -27,7 +27,7 @@ # aliases, and caches data for display. from __future__ import annotations import html.parser -from .config import FacadeSession as FacadeSession +from .config import FacadeHelper as FacadeHelper #.facade06analyze analysis moved to facade_tasks.py - IM 10/12/22 #from contributor_interfaceable.facade08contributorinterfaceable import ContributorInterfaceable diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py deleted file mode 100644 index 3ec201327..000000000 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2016-2018 Brian Warner -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -# Git repo maintenance -# -# This script is responsible for cloning new repos and keeping existing repos up -# to date. It can be run as often as you want (and will detect when it's -# already running, so as not to spawn parallel processes), but once or twice per -# day should be more than sufficient. Each time it runs, it updates the repo -# and checks for any parents of HEAD that aren't already accounted for in the -# repos. It also rebuilds analysis data, checks any changed affiliations and -# aliases, and caches data for display. 
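Editor's note: `FacadeHelper.insert_or_update_data` above keeps facade's old deadlock handling: execute the statement and, if Postgres reports a deadlock, back off and retry a bounded number of times. A self-contained sketch of that retry shape (the attempt count and sleep are illustrative, not taken from this diff):

```python
import time

from psycopg2.errors import DeadlockDetected
from sqlalchemy.exc import OperationalError


def run_with_deadlock_retry(execute, attempts=5, wait_seconds=5):
    """Call execute(); retry only when the failure was a Postgres deadlock."""
    for _ in range(attempts):
        try:
            execute()
            return
        except OperationalError as exc:
            if not isinstance(exc.orig, DeadlockDetected):
                raise                      # anything other than a deadlock is fatal
            time.sleep(wait_seconds)       # back off, then retry
    raise RuntimeError(f"statement still deadlocking after {attempts} attempts")
```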
-import subprocess -import sqlalchemy as s -from augur.application.db.util import execute_session_query -from .utilitymethods import get_absolute_repo_path -from augur.application.db.models import * - -#Will delete repos passed and cleanup associated commit data. -def git_repo_cleanup(session,repo_git): - -# Clean up any git repos that are pending deletion - - session.update_status('Purging deleted repos') - #session.logger.info("Processing deletions") - session.log_activity('Info','Processing deletions') - - - query = session.query(Repo).filter( - Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") - - delete_repos = execute_session_query(query,'all')#session.fetchall_data_from_sql_text(query) - - for row in delete_repos: - - # Remove the files on disk - - absolute_path = get_absolute_repo_path(session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) - - cmd = ("rm -rf %s" - % (absolute_path)) - - return_code = subprocess.Popen([cmd],shell=True).wait() - - # Remove the analysis data - - remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_commits) - - optimize_table = s.sql.text("""OPTIMIZE TABLE commits""") - session.execute_sql(optimize_table) - - # Remove cached repo data - - remove_dm_repo_weekly = s.sql.text("""DELETE FROM dm_repo_weekly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_weekly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_weekly""") - session.execute_sql(optimize_table) - - remove_dm_repo_monthly = s.sql.text("""DELETE FROM dm_repo_monthly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_monthly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_monthly""") - session.execute_sql(optimize_table) - - remove_dm_repo_annual = s.sql.text("""DELETE FROM dm_repo_annual WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_annual) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_annual""") - session.execute_sql(optimize_table) - - # Set project to be recached if just removing a repo - - set_project_recache = s.sql.text("""UPDATE projects SET recache=TRUE - WHERE id=:repo_group_id""").bindparams(repo_group_id=row.repo_group_id) - session.execute_sql(set_project_recache) - # Remove the entry from the repos table - - query = s.sql.text("""DELETE FROM repo WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(query) - - #log_activity('Verbose','Deleted repo %s' % row[0]) - #session.logger.debug(f"Deleted repo {row.repo_id}") - session.log_activity('Verbose',f"Deleted repo {row.repo_id}") - cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) - - # Remove any working commits - - remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_working_commits) - - # Remove the repo from the logs - - remove_logs = s.sql.text("""DELETE FROM repos_fetch_log WHERE repos_id =:repo_id - """).bindparams(repo_id=row.repo_id) - - session.execute_sql(remove_logs) - - optimize_table = s.sql.text("""OPTIMIZE TABLE repos_fetch_log""") - session.execute_sql(optimize_table) - - # Attempt to cleanup any empty parent directories - - while (cleanup.find('/',0) > 0): - cleanup = cleanup[:cleanup.rfind('/',0)] - 
- cmd = "rmdir %s%s" % (session.repo_base_directory,cleanup) - subprocess.Popen([cmd],shell=True).wait() - #log_activity('Verbose','Attempted %s' % cmd) - #session.logger.debug(f"Attempted {cmd}") - session.log_activity('Verbose',f"Attempted {cmd}") - - #update_repo_log(row[0],'Deleted') - session.update_repo_log(row.repo_id,'Deleted') - - # Clean up deleted projects - - get_deleted_projects = s.sql.text("""SELECT repo_group_id FROM repo_groups WHERE rg_name='(Queued for removal)'""") - - deleted_projects = session.fetchall_data_from_sql_text(get_deleted_projects) - - for project in deleted_projects: - - # Remove cached data for projects which were marked for deletion - - clear_annual_cache = s.sql.text("""DELETE FROM dm_repo_group_annual WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_annual_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_annual""") - session.execute_sql(optimize_table) - - clear_monthly_cache = s.sql.text("""DELETE FROM dm_repo_group_monthly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_monthly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_monthly""") - session.execute_sql(optimize_table) - - clear_weekly_cache = s.sql.text("""DELETE FROM dm_repo_group_weekly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_weekly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) - - clear_unknown_cache = s.sql.text("""DELETE FROM unknown_cache WHERE - projects_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_unknown_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) - - # Remove any projects which were also marked for deletion - - remove_project = s.sql.text("""DELETE FROM repo_groups WHERE repo_group_id=:repo_group_id - """).bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(remove_project) - - - session.log_activity('Info', 'Processing deletions (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index e4697dbc1..d92f17b69 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -26,13 +26,14 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text from .utilitymethods import store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql # else: # import MySQLdb -def nuke_affiliations(session): +def nuke_affiliations(facade_helper): # Delete all stored affiliations in the database. Normally when you # add/remove/change affiliation data via the web UI, any potentially affected @@ -42,16 +43,16 @@ def nuke_affiliations(session): # this is the scorched earth way: remove them all to force a total rebuild. # Brutal but effective. 
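Editor's note: as the hunk just below shows, the rebuild-cache functions now take a `FacadeHelper` only for settings and activity logging, while the SQL itself goes through the module-level `execute_sql` helper. A short sketch of that calling convention (the function name here is illustrative):

```python
import sqlalchemy as s

from augur.application.db.lib import execute_sql


def clear_stored_affiliations(facade_helper):
    """Same shape as nuke_affiliations: helper for logging, lib function for SQL."""
    facade_helper.log_activity("Info", "Nuking affiliations")

    execute_sql(s.sql.text(
        """UPDATE commits SET cmt_author_affiliation = NULL,
                              cmt_committer_affiliation = NULL"""
    ))

    facade_helper.log_activity("Info", "Nuking affiliations (complete)")
```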
- session.log_activity('Info','Nuking affiliations') + facade_helper.log_activity('Info','Nuking affiliations') nuke = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL, cmt_committer_affiliation = NULL""") - session.execute_sql(nuke) + execute_sql(nuke) - session.log_activity('Info','Nuking affiliations (complete)') + facade_helper.log_activity('Info','Nuking affiliations (complete)') -def fill_empty_affiliations(session): +def fill_empty_affiliations(facade_helper): @@ -79,13 +80,13 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) if not matches and email.find('@') < 0: # It's not a properly formatted email, leave it NULL and log it. - session.log_activity('Info',f"Unmatchable email: {email}") + facade_helper.log_activity('Info',f"Unmatchable email: {email}") return @@ -104,7 +105,7 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_domain) + matches = fetchall_data_from_sql_text(find_exact_domain) if not matches: @@ -117,7 +118,7 @@ def discover_null_affiliations(attribution,email): ORDER BY ca_start_date DESC""").bindparams(strippedDomain=domain[domain.rfind('.',0,domain.rfind('.',0))+1:]) - matches = session.fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) if not matches: @@ -130,7 +131,7 @@ def discover_null_affiliations(attribution,email): if matches: - session.log_activity('Debug',f"Found domain match for {email}") + facade_helper.log_activity('Debug',f"Found domain match for {email}") for match in matches: update = s.sql.text(("UPDATE commits " @@ -140,14 +141,14 @@ def discover_null_affiliations(attribution,email): f"AND cmt_{attribution}_date::date >= \'{match['ca_start_date']}\'::date") ).bindparams(affiliation=match['ca_affiliation'],email=email) - session.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") + facade_helper.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") try: - session.execute_sql(update) + execute_sql(update) except Exception as e: - session.log_activity('Info', f"Error encountered: {e}") - session.log_activity('Info', f"Affiliation insertion failed for {email} ") - session.log_activity('Info', f"Offending query: {update} ") + facade_helper.log_activity('Info', f"Error encountered: {e}") + facade_helper.log_activity('Info', f"Affiliation insertion failed for {email} ") + facade_helper.log_activity('Info', f"Offending query: {update} ") def discover_alias(email): @@ -158,7 +159,7 @@ def discover_alias(email): WHERE alias_email=:email AND cntrb_active = 1""").bindparams(email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) if canonical: for email in canonical: @@ -168,8 +169,8 @@ def discover_alias(email): ### The real function starts here ### - session.update_status('Filling empty affiliations') - session.log_activity('Info','Filling empty affiliations') + facade_helper.update_status('Filling empty affiliations') + facade_helper.log_activity('Info','Filling empty affiliations') # Process any changes to the affiliations or aliases, and set any existing # entries in commits to NULL so they are filled properly. 
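Editor's note: `discover_null_affiliations` above falls back through progressively looser matches: exact email, then the full domain, then the domain stripped to its last two labels, logging the address as unmatchable if nothing applies. A standalone sketch of that fallback order, with a plain dict standing in for `contributor_affiliations`:

```python
def match_affiliation(email, affiliations):
    """Mirror the fallback order used by discover_null_affiliations."""
    if email in affiliations:                        # 1. exact email match
        return affiliations[email]
    if "@" not in email:                             # not a usable address; caller logs it
        return None
    domain = email.split("@", 1)[1]
    if domain in affiliations:                       # 2. exact domain match
        return affiliations[domain]
    stripped = ".".join(domain.rsplit(".", 2)[-2:])  # 3. last two labels, e.g. example.com
    return affiliations.get(stripped)                # None -> treated as unmatched


print(match_affiliation("dev@mail.example.com", {"example.com": "Example Corp"}))
```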
@@ -178,41 +179,41 @@ def discover_alias(email): timefetch = s.sql.text("""SELECT current_timestamp(6) as fetched""") - affiliations_fetched = session.execute_sql(timefetch).fetchone()[0] + affiliations_fetched = execute_sql(timefetch).fetchone()[0] print(affiliations_fetched) # Now find the last time we worked on affiliations, to figure out what's new - affiliations_processed = session.get_setting('affiliations_processed') + affiliations_processed = facade_helper.get_setting('affiliations_processed') get_changed_affiliations = s.sql.text("""SELECT ca_domain FROM contributor_affiliations""")# WHERE " #"ca_last_used >= timestamptz %s") - changed_affiliations = session.fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) + changed_affiliations = fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) # Process any affiliations which changed since we last checked for changed_affiliation in changed_affiliations: - session.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_author_to_null) + execute_sql(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_committer_to_null) + execute_sql(set_committer_to_null) # Update the last fetched date, so we know where to start next time. 
update_affiliations_date = s.sql.text("""UPDATE settings SET value=:affiliations WHERE setting = 'affiliations_processed'""").bindparams(affiliations=affiliations_fetched) - session.execute_sql(update_affiliations_date) + execute_sql(update_affiliations_date) # On to the aliases, now @@ -220,61 +221,61 @@ def discover_alias(email): get_time = s.sql.text("""SELECT current_timestamp(6) as fetched""") - aliases_fetched = session.execute_sql(get_time).fetchone()[0]#['fetched'] + aliases_fetched = execute_sql(get_time).fetchone()[0]#['fetched'] # Now find the last time we worked on aliases, to figure out what's new - aliases_processed = session.get_setting('aliases_processed') + aliases_processed = facade_helper.get_setting('aliases_processed') get_changed_aliases = s.sql.text("""SELECT alias_email FROM contributors_aliases WHERE cntrb_last_modified >= :aliases""").bindparams(aliases=aliases_processed) - changed_aliases = session.fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) + changed_aliases = fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) # Process any aliases which changed since we last checked for changed_alias in changed_aliases: - session.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_raw_email LIKE CONCAT('%%',:alias)""").bindparams(alias=changed_alias['alias_email']) - session.insert_or_update_data(set_author_to_null) + facade_helper.insert_or_update_data(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_raw_email LIKE CONCAT('%%',:alias_email)""").bindparams(alias_email=changed_alias['alias_email']) - session.insert_or_update_data(set_committer_to_null) + facade_helper.insert_or_update_data(set_committer_to_null) reset_author = s.sql.text("""UPDATE commits SET cmt_author_email = :author_email WHERE cmt_author_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']),raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_author) + facade_helper.insert_or_update_data(reset_author) reset_committer = s.sql.text("""UPDATE commits SET cmt_committer_email = :author_email WHERE cmt_committer_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']), raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_committer) + facade_helper.insert_or_update_data(reset_committer) # Update the last fetched date, so we know where to start next time. 
update_aliases_date = s.sql.text("""UPDATE settings SET value=:aliases WHERE setting = 'aliases_processed'""").bindparams(aliases=aliases_fetched) - session.execute_sql(update_aliases_date) + execute_sql(update_aliases_date) # Now rebuild the affiliation data - working_author = session.get_setting('working_author') + working_author = facade_helper.get_setting('working_author') if working_author != 'done': - session.log_activity('Error',f"Trimming author data in affiliations: {working_author}") - trim_author(session, working_author) + facade_helper.log_activity('Error',f"Trimming author data in affiliations: {working_author}") + trim_author(facade_helper, working_author) # Figure out which projects have NULL affiliations so they can be recached @@ -294,7 +295,7 @@ def discover_alias(email): # "SET rg_recache=TRUE WHERE " # "author_affiliation IS NULL OR " # "committer_affiliation IS NULL") - session.execute_sql(set_recache) + execute_sql(set_recache) # Find any authors with NULL affiliations and fill them @@ -304,19 +305,19 @@ def discover_alias(email): WHERE cmt_author_affiliation IS NULL GROUP BY cmt_author_email""") - null_authors = session.fetchall_data_from_sql_text(find_null_authors) + null_authors = fetchall_data_from_sql_text(find_null_authors) - session.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") for null_author in null_authors: email = null_author['email'] - store_working_author(session, email) + store_working_author(facade_helper, email) discover_null_affiliations('author',email) - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') # Find any committers with NULL affiliations and fill them @@ -326,15 +327,15 @@ def discover_alias(email): WHERE cmt_committer_affiliation IS NULL GROUP BY cmt_committer_email""") - null_committers = session.fetchall_data_from_sql_text(find_null_committers) + null_committers = fetchall_data_from_sql_text(find_null_committers) - session.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") for null_committer in null_committers: email = null_committer['email'] - store_working_author(session, email) + store_working_author(facade_helper, email) discover_null_affiliations('committer',email) @@ -344,43 +345,43 @@ def discover_alias(email): SET cmt_author_affiliation = '(Unknown)' WHERE cmt_author_affiliation IS NULL""") - session.execute_sql(fill_unknown_author) + execute_sql(fill_unknown_author) fill_unknown_committer = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = '(Unknown)' WHERE cmt_committer_affiliation IS NULL""") - session.execute_sql(fill_unknown_committer) + execute_sql(fill_unknown_committer) - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') - session.log_activity('Info','Filling empty affiliations (complete)') + facade_helper.log_activity('Info','Filling empty affiliations (complete)') -def invalidate_caches(session): +def invalidate_caches(facade_helper): # Invalidate all caches - session.update_status('Invalidating caches') - session.log_activity('Info','Invalidating caches') + facade_helper.update_status('Invalidating caches') + facade_helper.log_activity('Info','Invalidating caches') invalidate_cache = s.sql.text("""UPDATE repo_groups SET rg_recache = 1""") - 
session.execute_sql(invalidate_cache) + execute_sql(invalidate_cache) - session.log_activity('Info','Invalidating caches (complete)') + facade_helper.log_activity('Info','Invalidating caches (complete)') -def rebuild_unknown_affiliation_and_web_caches(session): +def rebuild_unknown_affiliation_and_web_caches(facade_helper): # When there's a lot of analysis data, calculating display data on the fly gets # pretty expensive. Instead, we crunch the data based upon the user's preferred # statistics (author or committer) and store them. We also store all records # with an (Unknown) affiliation for display to the user. - session.update_status('Caching data for display') - session.log_activity('Info','Caching unknown affiliations and web data for display') + facade_helper.update_status('Caching data for display') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display') - report_date = session.get_setting('report_date') - report_attribution = session.get_setting('report_attribution') + report_date = facade_helper.get_setting('report_date') + report_attribution = facade_helper.get_setting('report_attribution') # Clear stale caches @@ -491,9 +492,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM unknown_cache c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_unknown_cache) + execute_sql(clear_unknown_cache) - session.log_activity('Verbose','Caching unknown authors and committers') + facade_helper.log_activity('Verbose','Caching unknown authors and committers') # Cache the unknown authors @@ -513,9 +514,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_author_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(unknown_authors) + execute_sql(unknown_authors) # Cache the unknown committers @@ -533,13 +534,13 @@ def rebuild_unknown_affiliation_and_web_caches(session): WHERE a.cmt_committer_affiliation = '(Unknown)' AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_committer_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - session.execute_sql(unknown_committers) + execute_sql(unknown_committers) # Start caching by project - session.log_activity('Verbose','Caching projects') + facade_helper.log_activity('Verbose','Caching projects') cache_projects_by_week = s.sql.text(( "INSERT INTO dm_repo_group_weekly (repo_group_id, email, affiliation, week, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -573,7 +574,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email, " "r.repo_group_id, info.a, info.b, info.c") - ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + ).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) # session.execute_sql(cache_projects_by_week) @@ -609,7 +610,7 @@ def 
rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "r.repo_group_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) # session.execute_sql(cache_projects_by_month) @@ -646,7 +647,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) @@ -654,7 +655,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # session.execute_sql(cache_projects_by_year) # Start caching by repo - session.log_activity('Verbose','Caching repos') + facade_helper.log_activity('Verbose','Caching repos') cache_repos_by_week = s.sql.text( ( @@ -689,7 +690,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) # session.execute_sql(cache_repos_by_week) @@ -725,7 +726,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) # session.execute_sql(cache_repos_by_month) @@ -759,14 +760,14 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) # session.execute_sql(cache_repos_by_year) # Reset cache flags reset_recache = s.sql.text("UPDATE repo_groups SET rg_recache = 0") - session.execute_sql(reset_recache) + execute_sql(reset_recache) - session.log_activity('Info','Caching unknown affiliations and web data for display (complete)') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 64571bdd9..874f33890 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -31,30 +31,32 @@ import pathlib import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path +from sqlalchemy.orm.exc import NoResultFound from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list +from augur.application.db.lib import execute_sql, 
get_repo_by_repo_git class GitCloneError(Exception): pass -def git_repo_initialize(session, repo_git): +def git_repo_initialize(facade_helper, session, repo_git): # Select any new git repos so we can set up their locations and git clone - session.update_status('Fetching non-cloned repos') - session.log_activity('Info', 'Fetching non-cloned repos') + facade_helper.update_status('Fetching non-cloned repos') + facade_helper.log_activity('Info', 'Fetching non-cloned repos') # Get data as a list of dicts - # new_repos = session.fetchall_data_from_sql_text(query)#list(cfg.cursor) + # new_repos = fetchall_data_from_sql_text(query)#list(cfg.cursor) row = Repo.get_by_repo_git(session, repo_git) if row: - session.log_activity( + facade_helper.log_activity( 'Info', f"Fetching repo with repo id: {row.repo_id}") - update_repo_log(session, row.repo_id, 'Cloning') + update_repo_log(logger, facade_helper, row.repo_id, 'Cloning') git = html.unescape(row.repo_git) @@ -62,28 +64,28 @@ def git_repo_initialize(session, repo_git): if git.find('://', 0) > 0: platform_org_git_url_section = git[git.find( '://', 0)+3:][:git[git.find('://', 0)+3:].rfind('/', 0)+1] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, from for row in new_repos, line 79: {platform_org_git_url_section}") - session.log_activity('Info', f"The git path used : {git}") + facade_helper.log_activity('Info', f"The git path used : {git}") else: platform_org_git_url_section = git[:git.rfind('/', 0)+1] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, line 80, reset at 86: {platform_org_git_url_section}") # Get the name of repo repo_name = git[git.rfind('/', 0)+1:] if repo_name.endswith('.git'): repo_name = repo_name[:repo_name.find('.git', 0)] - session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Name from facade05, line 93: {repo_name}") path_identifier = f"{platform_org_git_url_section}{repo_name}".replace('/','-') # Get the full path to the directory where we'll clone the repo repo_path = ( - f"{session.repo_base_directory}{row.repo_id}-{path_identifier}") - session.log_activity( + f"{facade_helper.repo_base_directory}{row.repo_id}-{path_identifier}") + facade_helper.log_activity( 'Info', f"Repo Path from facade05, line 86: {repo_path}") @@ -91,21 +93,21 @@ def git_repo_initialize(session, repo_git): # query = s.sql.text("""SELECT NULL FROM repo WHERE CONCAT(repo_group_id,'/',repo_path,repo_name) = :repo_group_id # """).bindparams(repo_group_id=f"{row.repo_group_id}/{platform_org_git_url_section}{repo_name}") # - # result = session.fetchall_data_from_sql_text(query) + # result = fetchall_data_from_sql_text(query) query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) # Check if there will be a storage path collision # If there is a collision, throw an error so that it updates the existing repo instead of trying # to reclone. if os.path.isdir(repo_path): # len(result): - session.log_activity( + facade_helper.log_activity( 'Verbose', f"Identical repo detected, storing {git} in {repo_name}") - session.logger.warning( + logger.warning( f"Identical repo found in facade directory! 
Repo git: {git}") statusQuery = session.query(CollectionStatus).filter( CollectionStatus.repo_id == row.repo_id) @@ -119,7 +121,7 @@ def git_repo_initialize(session, repo_git): repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) return # Create the prerequisite directories @@ -128,23 +130,23 @@ def git_repo_initialize(session, repo_git): except Exception as e: print("COULD NOT CREATE REPO DIRECTORY") - update_repo_log(session, row.repo_id, 'Failed (mkdir)') - session.update_status(f"Failed (mkdir {repo_path})") - session.log_activity( + update_repo_log(logger, facade_helper, row.repo_id, 'Failed (mkdir)') + facade_helper.update_status(f"Failed (mkdir {repo_path})") + facade_helper.log_activity( 'Error', f"Could not create repo directory: {repo_path}") raise e - update_repo_log(session, row.repo_id, 'New (cloning)') + update_repo_log(logger, facade_helper, row.repo_id, 'New (cloning)') #Make sure newly cloned repo path is recorded in repo table query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) - session.log_activity('Verbose', f"Cloning: {git}") + facade_helper.log_activity('Verbose', f"Cloning: {git}") cmd = f"git -C {repo_path} clone '{git}' {repo_name}" return_code = subprocess.Popen([cmd], shell=True).wait() @@ -153,18 +155,18 @@ def git_repo_initialize(session, repo_git): # If cloning succeeded, repo is ready for analysis # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. - update_repo_log(session, row.repo_id, 'Up-to-date') - session.log_activity('Info', f"Cloned {git}") + update_repo_log(logger, facade_helper, row.repo_id, 'Up-to-date') + facade_helper.log_activity('Info', f"Cloned {git}") else: # If cloning failed, log it and set the status back to new - update_repo_log(session, row.repo_id, f"Failed ({return_code})") + update_repo_log(logger, facade_helper, row.repo_id, f"Failed ({return_code})") - session.log_activity('Error', f"Could not clone {git}") + facade_helper.log_activity('Error', f"Could not clone {git}") raise GitCloneError(f"Could not clone {git}") - session.log_activity('Info', f"Fetching new repos (complete)") + facade_helper.log_activity('Info', f"Fetching new repos (complete)") # Deprecated functionality. No longer used @@ -185,8 +187,8 @@ def check_for_repo_updates(session, repo_git): AND repo_status != 'Analyze' AND repo_status != 'Empty' AND repo_git = :value""").bindparams(value=repo_git) - # repos = session.fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) - repo = session.execute_sql(get_initialized_repos).fetchone() + # repos = fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) + repo = execute_sql(get_initialized_repos).fetchone() if repo: @@ -196,7 +198,7 @@ def check_for_repo_updates(session, repo_git): repos_id=:repo_id AND status='Up-to-date' AND date >= CURRENT_TIMESTAMP(6) - INTERVAL :update_freq HOUR """).bindparams(repo_id=repo['repo_id'], update_freq=update_frequency[0]) - result = session.fetchall_data_from_sql_text(get_last_update) + result = fetchall_data_from_sql_text(get_last_update) # If the repo has not been updated within the waiting period, mark it. 
# Also mark any other repos in the project, so we only recache the # project once per waiting period. @@ -213,7 +215,7 @@ def check_for_repo_updates(session, repo_git): # "SET status='Update' WHERE " # "r.id=%s and r.status != 'Empty'") - session.execute_sql(mark_repo) + execute_sql(mark_repo) # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. @@ -250,7 +252,7 @@ def force_repo_updates(session, repo_git): get_repo_ids = s.sql.text("""UPDATE repo SET repo_status='Update' WHERE repo_status NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status !='Empty' AND repo_git=:value""").bindparams(value=repo_git) - session.execute_sql(get_repo_ids) + execute_sql(get_repo_ids) session.log_activity('Info', 'Forcing repos to update (complete)') @@ -268,38 +270,35 @@ def force_repo_analysis(session, repo_git): NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status != 'Empty' AND repo_git=:repo_git_ident""").bindparams(repo_git_ident=repo_git) - session.execute_sql(set_to_analyze) + execute_sql(set_to_analyze) session.log_activity('Info', 'Forcing repos to be analyzed (complete)') -def git_repo_updates(session, repo_git): +def git_repo_updates(facade_helper, repo_git): # Update existing repos - session.update_status('Updating repos') - session.log_activity('Info', 'Updating existing repos') + facade_helper.update_status('Updating repos') + facade_helper.log_activity('Info', 'Updating existing repos') # query = s.sql.text("""SELECT repo_id,repo_group_id,repo_git,repo_name,repo_path FROM repo WHERE # repo_status='Update'""") - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - result = execute_session_query(query, 'all') try: - # session.fetchall_data_from_sql_text(query)#list(cfg.cursor) - row = convert_orm_list_to_dict_list(result)[0] - except IndexError: + repo = get_repo_by_repo_git(repo_git) + except NoResultFound: raise Exception( f"Repo git: {repo_git} does not exist or the status is not 'Update'") - if row["repo_path"] is None or row["repo_name"] is None: + + if repo.repo_path is None or repo.repo_name is None: raise Exception( - f"The repo path or repo name is NULL for repo_id: {row['repo_id']}") + f"The repo path or repo name is NULL for repo_id: {repo.repo_id}") - session.log_activity( - 'Verbose', f"Attempting to update {row['repo_git']}") # ['git']) - update_repo_log(session, row['repo_id'], 'Updating') # ['id'],'Updating') + facade_helper.log_activity( + 'Verbose', f"Attempting to update {repo.repo_git}") # ['git']) + update_repo_log(logger, facade_helper, repo.repo_id, 'Updating') # ['id'],'Updating') attempt = 0 @@ -310,7 +309,7 @@ def git_repo_updates(session, repo_git): # default_branch = '' absolute_path = get_absolute_repo_path( - session.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) + facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) while attempt < 2: @@ -321,7 +320,7 @@ def git_repo_updates(session, repo_git): return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() - session.log_activity('Verbose', 'Got to here. 1.') + facade_helper.log_activity('Verbose', 'Got to here. 
1.') if return_code_remote == 0: @@ -343,26 +342,26 @@ def git_repo_updates(session, repo_git): remotedefault = remotedefault.decode() - session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default getting checked out is: {remotedefault}.') getremotedefault = ( f"git -C {absolute_path} checkout {remotedefault}") - session.log_activity( + facade_helper.log_activity( 'Verbose', f"get remote default command is: \n \n {getremotedefault} \n \n ") return_code_remote_default_again = subprocess.Popen( [getremotedefault], shell=True).wait() if return_code_remote_default_again == 0: - session.log_activity('Verbose', "local checkout worked.") + facade_helper.log_activity('Verbose', "local checkout worked.") cmd = (f"git -C {absolute_path} pull") return_code = subprocess.Popen([cmd], shell=True).wait() except Exception as e: - session.log_activity( + facade_helper.log_activity( 'Verbose', f'Error code on branch change is {e}.') pass @@ -378,8 +377,8 @@ def git_repo_updates(session, repo_git): break elif attempt == 0: - session.log_activity( - 'Verbose', f"git pull failed, attempting reset and clean for {row['repo_git']}") + facade_helper.log_activity( + 'Verbose', f"git pull failed, attempting reset and clean for {repo.repo_git}") # remotedefault = 'main' @@ -412,7 +411,7 @@ def git_repo_updates(session, repo_git): return_message_getremotedefault = subprocess.Popen( [getremotedefault], stdout=subprocess.PIPE, shell=True).communicate()[0] - session.log_activity( + facade_helper.log_activity( 'Verbose', f'get remote default result: {return_message_getremotedefault}') getcurrentbranch = (f"git -C {absolute_path} branch") @@ -425,7 +424,7 @@ def git_repo_updates(session, repo_git): localdefault = localdefault.decode() - session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default is: {remotedefault}, and localdefault is {localdefault}.') cmd_checkout_default = ( @@ -448,7 +447,7 @@ def git_repo_updates(session, repo_git): except Exception as e: - session.log_activity('Verbose', f'Second pass failed: {e}.') + facade_helper.log_activity('Verbose', f'Second pass failed: {e}.') pass cmdpull2 = (f"git -C {absolute_path} pull") @@ -462,12 +461,12 @@ def git_repo_updates(session, repo_git): if return_code == 0: - update_repo_log(session, row['repo_id'], 'Up-to-date') - session.log_activity('Verbose', f"Updated {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, 'Up-to-date') + facade_helper.log_activity('Verbose', f"Updated {repo.repo_git}") else: - update_repo_log(session, row['repo_id'], f"Failed ({return_code})") - session.log_activity('Error', f"Could not update {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, f"Failed ({return_code})") + facade_helper.log_activity('Error', f"Could not update {repo.repo_git}") - session.log_activity('Info', 'Updating existing repos (complete)') + facade_helper.log_activity('Info', 'Updating existing repos (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 848cb3891..40f3a29e0 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -29,52 +29,44 @@ from subprocess import check_output import os import sqlalchemy as s -from sqlalchemy.exc import DataError from augur.application.db.models import * -from .config import FacadeSession as FacadeSession +from .config import 
FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session +from augur.application.db.util import execute_session_query #from augur.tasks.git.util.facade_worker.facade -def update_repo_log(session, repos_id,status): +def update_repo_log(logger, facade_helper, repos_id,status): # Log a repo's fetch status - session.log_activity("Info",f"{status} {repos_id}") + facade_helper.log_activity("Info",f"{status} {repos_id}") #log_message = ("INSERT INTO repos_fetch_log (repos_id,status) " # "VALUES (%s,%s)") try: log_message = s.sql.text("""INSERT INTO repos_fetch_log (repos_id,status) VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) - #session.insert_data(data,t_repos_fetch_log,['repos_id','status']) - session.execute_sql(log_message) + #bulk_insert_dicts(data,t_repos_fetch_log,['repos_id','status']) + execute_sql(log_message) except Exception as e: - session.logger.error(f"Ran into error in update_repo_log: {e}") + logger.error(f"Ran into error in update_repo_log: {e}") pass -def trim_commits(session, repo_id,commits): +def trim_commits(facade_helper, repo_id,commits): # Quickly remove a given commit if len(commits): - remove_commit = s.sql.text("""DELETE FROM commits - WHERE repo_id=:repo_id - AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - - session.execute_sql(remove_commit) + remove_commits_by_repo_id_and_hashes(repo_id, commits) # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND - working_commit IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - session.execute_sql(remove_commit) + remove_working_commits_by_repo_id_and_hashes(repo_id, commits) for commit in commits: - session.log_activity('Debug',f"Trimmed commit: {commit}") - session.log_activity('Debug',f"Removed working commit: {commit}") + facade_helper.log_activity('Debug',f"Trimmed commit: {commit}") + facade_helper.log_activity('Debug',f"Removed working commit: {commit}") -def store_working_author(session, email): +def store_working_author(facade_helper, email): # Store the working author during affiliation discovery, in case it is # interrupted and needs to be trimmed. @@ -84,11 +76,11 @@ def store_working_author(session, email): WHERE setting = 'working_author' """).bindparams(email=email) - session.execute_sql(store) + execute_sql(store) - session.log_activity('Debug',f"Stored working author: {email}") + facade_helper.log_activity('Debug',f"Stored working author: {email}") -def trim_author(session, email): +def trim_author(facade_helper, email): # Remove the affiliations associated with an email. Used when an analysis is # interrupted during affiliation layering, and the data will be corrupt. 
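The utilitymethods.py hunks above thread an explicit logger next to the facade helper and push the raw DELETE statements down into augur.application.db.lib. A short usage sketch, assuming only the signatures shown in this patch; the module path is inferred from the file layout, and the repo id, hashes, and email are placeholders:

import logging

from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import (
    update_repo_log, trim_commits, store_working_author, trim_author)

logger = logging.getLogger(__name__)
facade_helper = FacadeHelper(logger)          # constructed the same way as in get_repo_weight_by_commit later in this patch

repo_id = 42                                  # placeholder repo id
stale_hashes = ["deadbeef", "cafebabe"]       # placeholder commit hashes to drop

update_repo_log(logger, facade_helper, repo_id, 'Updating')  # logger is now an explicit argument
trim_commits(facade_helper, repo_id, stale_hashes)           # removes commits and working commits via db.lib helpers
store_working_author(facade_helper, 'done')                  # settings writes go through execute_sql
trim_author(facade_helper, 'ghost@example.com')              # clears affiliations recorded for one email
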
@@ -97,21 +89,17 @@ def trim_author(session, email): SET cmt_author_affiliation = NULL WHERE cmt_author_email = :email """).bindparams(email=email) - - - - session.execute_sql(trim) + execute_sql(trim) trim = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email = :email """).bindparams(email=email) + execute_sql(trim) - session.execute_sql(trim) - - store_working_author(session, 'done') + store_working_author(facade_helper, 'done') - session.log_activity('Debug',f"Trimmed working author: {email}") + facade_helper.log_activity('Debug',f"Trimmed working author: {email}") def get_absolute_repo_path(repo_base_dir, repo_id, repo_path,repo_name): @@ -134,12 +122,12 @@ def get_parent_commits_set(absolute_repo_path, start_date): return parent_commits -def get_existing_commits_set(session, repo_id): +def get_existing_commits_set(repo_id): find_existing = s.sql.text("""SELECT DISTINCT cmt_commit_hash FROM commits WHERE repo_id=:repo_id """).bindparams(repo_id=repo_id) - existing_commits = [commit['cmt_commit_hash'] for commit in session.fetchall_data_from_sql_text(find_existing)] + existing_commits = [commit['cmt_commit_hash'] for commit in fetchall_data_from_sql_text(find_existing)] return set(existing_commits) @@ -148,15 +136,15 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(session, repo_git): - - repo = Repo.get_by_repo_git(session, repo_git) +def get_repo_commit_count(logger, facade_helper, repo_git): - absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo = get_repo_by_repo_git(repo_git) + + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") - session.logger.debug(f"loc: {repo_loc}") - session.logger.debug(f"path: {repo.repo_path}") + logger.debug(f"loc: {repo_loc}") + logger.debug(f"path: {repo.repo_path}") # Check if the .git directory exists if not os.path.exists(repo_loc): @@ -171,77 +159,47 @@ def get_repo_commit_count(session, repo_git): return commit_count -def get_facade_weight_time_factor(session,repo_git): - repo = Repo.get_by_repo_git(session, repo_git) - - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) - - #Adjust for commits. - time_factor *= 1.2 +def get_facade_weight_time_factor(repo_git): - return time_factor + with get_session() as session: -def get_facade_weight_with_commit_count(session, repo_git, commit_count): - return commit_count - get_facade_weight_time_factor(session, repo_git) + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) + + #Adjust for commits. 
+ time_factor *= 1.2 + + return time_factor + +def get_facade_weight_with_commit_count(repo_git, commit_count): + return commit_count - get_facade_weight_time_factor(repo_git) -def get_repo_weight_by_commit(logger,repo_git): - with FacadeSession(logger) as session: - return get_repo_commit_count(session, repo_git) - get_facade_weight_time_factor(session, repo_git) + +def get_repo_weight_by_commit(logger, repo_git): + facade_helper = FacadeHelper(logger) + return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) -def update_facade_scheduling_fields(session, repo_git, weight, commit_count): - repo = Repo.get_by_repo_git(session, repo_git) +def update_facade_scheduling_fields(repo_git, weight, commit_count): - update_query = ( - s.update(CollectionStatus) - .where(CollectionStatus.repo_id == repo.repo_id) - .values(facade_weight=weight,commit_sum=commit_count) - ) + repo = get_repo_by_repo_git(repo_git) - session.execute(update_query) - session.commit() + with get_session() as session: -def facade_bulk_insert_commits(session,records): + update_query = ( + s.update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(facade_weight=weight,commit_sum=commit_count) + ) - try: - session.execute( - s.insert(Commit), - records, - ) + session.execute(update_query) session.commit() - except Exception as e: - - if len(records) > 1: - session.logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") - - #split list into halves and retry insert until we isolate offending record - firsthalfRecords = records[:len(records)//2] - secondhalfRecords = records[len(records)//2:] - - facade_bulk_insert_commits(session,firsthalfRecords) - facade_bulk_insert_commits(session,secondhalfRecords) - elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": - commit_record = records[0] - #replace incomprehensible dates with epoch. 
- #2021-10-11 11:57:46 -0500 - placeholder_date = "1970-01-01 00:00:15 -0500" - - #Check for improper utc timezone offset - #UTC timezone offset should be betwen -14:00 and +14:00 - - commit_record['author_timestamp'] = placeholder_date - commit_record['committer_timestamp'] = placeholder_date - - session.execute( - s.insert(Commit), - [commit_record], - ) - session.commit() - else: - raise e + + diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 882725d20..8c2eed255 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -4,10 +4,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.facade_github.tasks import * -from augur.application.db.models import Contributor, Repo +from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts, get_session +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task @@ -19,48 +21,48 @@ def process_contributors(): tool_version = "2.0" data_source = "Github API" - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - augur_db = manifest.augur_db + with get_session() as session: - query = augur_db.session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) + query = session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) contributors = execute_session_query(query, 'all') - contributors_len = len(contributors) + contributors_len = len(contributors) - if contributors_len == 0: - logger.info("No contributors to enrich...returning...") - return + if contributors_len == 0: + logger.info("No contributors to enrich...returning...") + return - print(f"Length of contributors to enrich: {contributors_len}") - enriched_contributors = [] - for index, contributor in enumerate(contributors): + print(f"Length of contributors to enrich: {contributors_len}") + enriched_contributors = [] + for index, contributor in enumerate(contributors): - logger.info(f"Contributor {index + 1} of {contributors_len}") + logger.info(f"Contributor {index + 1} of {contributors_len}") - contributor_dict = contributor.__dict__ + contributor_dict = contributor.__dict__ - del contributor_dict["_sa_instance_state"] + del contributor_dict["_sa_instance_state"] - url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" + url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" - data = retrieve_dict_data(url, manifest.key_auth, logger) + data = retrieve_dict_data(url, key_auth, logger) - if data is None: - print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") - continue + if data is None: + print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") + continue - new_contributor_data = { - "cntrb_created_at": data["created_at"], - "cntrb_last_used": data["updated_at"] - } + new_contributor_data = { + "cntrb_created_at": data["created_at"], + "cntrb_last_used": data["updated_at"] + } - contributor_dict.update(new_contributor_data) + 
contributor_dict.update(new_contributor_data) - enriched_contributors.append(contributor_dict) + enriched_contributors.append(contributor_dict) - logger.info(f"Enriching {len(enriched_contributors)} contributors") - augur_db.insert_data(enriched_contributors, Contributor, ["cntrb_id"]) + logger.info(f"Enriching {len(enriched_contributors)} contributors") + bulk_insert_dicts(enriched_contributors, Contributor, ["cntrb_id"]) @@ -109,14 +111,10 @@ def grab_comitters(self, repo_git,platform="github"): engine = self.app.engine logger = logging.getLogger(grab_comitters.__name__) - with DatabaseSession(logger,engine) as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id try: - with GithubTaskManifest(logger) as manifest: - grab_committer_list(manifest, repo_id,platform) + key_auth = GithubRandomKeyAuth(logger) + grab_committer_list(logger, key_auth, repo_git, platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 2bf96ffa1..db005ce22 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,10 +6,11 @@ from datetime import datetime from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts -def update_repo_with_dict(repo,new_dict,logger,db): +def update_repo_with_dict(repo,new_dict,logger): """ Update a repository record in the database using a dictionary tagged with the appropriate table fields @@ -25,7 +26,7 @@ def update_repo_with_dict(repo,new_dict,logger,db): del to_insert['_sa_instance_state'] to_insert.update(new_dict) - result = db.insert_data(to_insert, Repo, ['repo_id']) + result = bulk_insert_dicts(logger, to_insert, Repo, ['repo_id']) url = to_insert['repo_git'] logger.info(f"Updated repo for {url}\n") @@ -43,7 +44,7 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): return splits[0], splits[-1] -def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook='core'): +def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'): owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" @@ -76,7 +77,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(repo, repo_update_dict, logger,augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) raise Exception("ERROR: Repo has moved! 
Resetting Collection!") @@ -90,9 +91,9 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') } - update_repo_with_dict(repo, repo_update_dict, logger, augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + statusQuery = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) collectionRecord = execute_session_query(statusQuery,'one') @@ -113,7 +114,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() + session.commit() raise Exception("ERROR: Repo has moved! Resetting Collection!") diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index c9da0d3ca..f542d8928 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -1,11 +1,10 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query - +from augur.application.db.lib import get_repo_by_repo_git, get_session +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @celery.task(base=AugurCoreRepoCollectionTask) @@ -14,14 +13,18 @@ def detect_github_repo_move_core(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_core.__name__) logger.info(f"Starting repo_move operation with {repo_git}") - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger) + ping_github_for_repo_move(session, key_auth, repo, logger) @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -30,11 +33,15 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_secondary.__name__) logger.info(f"Starting repo_move operation with {repo_git}") - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. 
- query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file + ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index 442af9922..ee4f40761 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -6,11 +6,12 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor, Repo -from augur.application.db.util import execute_session_query +from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id + platform_id = 1 @@ -19,32 +20,26 @@ def collect_events(repo_git: str): logger = logging.getLogger(collect_events.__name__) - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + try: + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - logger.info(f"Collecting Github events for {owner}/{repo}") + logger.info(f"Collecting Github events for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" + key_auth = GithubRandomKeyAuth(logger) - event_data = retrieve_all_event_data(repo_git, logger, manifest.key_auth) + event_data = retrieve_all_event_data(repo_git, logger, key_auth) - if event_data: - - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger, manifest.augur_db) + if event_data: + process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger) + else: + logger.info(f"{owner}/{repo} has no events") - else: - logger.info(f"{owner}/{repo} has no events") - except Exception as e: - logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + except Exception as e: + logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") def retrieve_all_event_data(repo_git: str, logger, key_auth): @@ -77,7 +72,7 @@ def retrieve_all_event_data(repo_git: str, logger, key_auth): return all_data -def process_events(events, task_name, repo_id, logger, augur_db): +def process_events(events, task_name, repo_id, logger): tool_source = "Github events task" tool_version = "2.0" @@ -90,13 +85,13 @@ def process_events(events, task_name, repo_id, logger, augur_db): # create mapping from issue url to issue id 
of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = get_issues_by_repo_id(repo_id) for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_url_to_id_map = {} - prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + prs = get_pull_requests_by_repo_id(repo_id) for pr in prs: pr_url_to_id_map[pr.pr_url] = pr.pull_request_id @@ -160,7 +155,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): # remove contributors that were found in the data more than once contributors = remove_duplicate_dicts(contributors) - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) issue_events_len = len(issue_event_dicts) pr_events_len = len(pr_event_dicts) @@ -174,12 +169,12 @@ def process_events(events, task_name, repo_id, logger, augur_db): # TODO: Could replace this with "id" but it isn't stored on the table for some reason pr_event_natural_keys = ["node_id"] - augur_db.insert_data(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) + bulk_insert_dicts(logger, pr_event_dicts, PullRequestEvent, pr_event_natural_keys) issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) - update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) + update_issue_closed_cntrbs_by_repo_id(repo_id) # TODO: Should we skip an event if there is no contributor to resolve it o def process_github_event_contributors(logger, event, tool_source, tool_version, data_source): @@ -195,41 +190,3 @@ def process_github_event_contributors(logger, event, tool_source, tool_version, return event, event_cntrb - -def update_issue_closed_cntrbs_from_events(engine, repo_id): - - get_ranked_issues = s.text(f""" - WITH RankedIssues AS ( - SELECT repo_id, issue_id, cntrb_id, - ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn - FROM issue_events - WHERE "action" = 'closed' - ) - - SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL - """) - - with engine.connect() as conn: - result = conn.execute(get_ranked_issues).fetchall() - - update_data = [] - for row in result: - update_data.append( - { - 'issue_id': row[0], - 'cntrb_id': row[1], - 'repo_id': repo_id - } - ) - - if update_data: - with engine.connect() as connection: - update_stmt = s.text(""" - UPDATE issues - SET cntrb_id = :cntrb_id - WHERE issue_id = :issue_id - AND repo_id = :repo_id - """) - connection.execute(update_stmt, update_data) - - diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 44b6c706f..49ff2dc14 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -7,7 +7,7 @@ # Debugger import traceback from augur.tasks.github.util.github_paginator import GithubApiResult -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id ##TODO: maybe have a TaskSession 
class that holds information about the database, logger, config, etc. @@ -24,8 +24,8 @@ # Hit the endpoint specified by the url and return the json that it returns if it returns a dict. # Returns None on failure. # NOTE: This function is being deprecated in favor of retrieve_dict_from_endpoint -def request_dict_from_endpoint(session, url, timeout_wait=10): - #session.logger.info(f"Hitting endpoint: {url}") +def request_dict_from_endpoint(logger, session, url, timeout_wait=10): + #logger.info(f"Hitting endpoint: {url}") attempts = 0 response_data = None @@ -33,9 +33,9 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): while attempts < 10: try: - response = hit_api(session.oauths, url, session.logger) + response = hit_api(session.oauths, url, logger) except TimeoutError: - session.logger.info( + logger.info( f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") time.sleep(timeout_wait) continue @@ -50,34 +50,34 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): response_data = json.loads(json.dumps(response.text)) if type(response_data) == dict: - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: attempts += 1 - session.logger.info(f"err: {err}") + logger.info(f"err: {err}") continue - #session.logger.info(f"Returned dict: {response_data}") + #logger.info(f"Returned dict: {response_data}") success = True break elif type(response_data) == list: - session.logger.warning("Wrong type returned, trying again...") - session.logger.info(f"Returned list: {response_data}") + logger.warning("Wrong type returned, trying again...") + logger.info(f"Returned list: {response_data}") elif type(response_data) == str: - session.logger.info( + logger.info( f"Warning! 
page_data was string: {response_data}") if "" in response_data: - session.logger.info("HTML was returned, trying again...\n") + logger.info("HTML was returned, trying again...\n") elif len(response_data) == 0: - session.logger.warning("Empty string, trying again...\n") + logger.warning("Empty string, trying again...\n") else: try: # Sometimes raw text can be converted to a dict response_data = json.loads(response_data) - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: @@ -105,7 +105,7 @@ def create_endpoint_from_email(email): return url -def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): +def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): logger.info( f"Trying to create endpoint from commit hash: {commit_sha}") @@ -113,15 +113,13 @@ def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): #stmnt = s.select(Repo.repo_path, Repo.repo_name).where(Repo.repo_id == repo_id) - - query = db.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') + result = get_repo_by_repo_id(repo_id) if result.repo_path is None or result.repo_name is None: raise KeyError # Else put into a more readable local var - #session.logger.info(f"Result: {result}") + #logger.info(f"Result: {result}") split_git = result.repo_git.split('/') repo_name_and_org = split_git[-2] + "/" + result.repo_name @@ -154,14 +152,13 @@ def create_endpoint_from_name(contributor): return url -def insert_alias(logger,db, contributor, email): +def insert_alias(logger, contributor, email): # Insert cntrb_id and email of the corresponding record into the alias table # Another database call to get the contributor id is needed because its an autokeyincrement that is accessed by multiple workers # Same principle as enrich_cntrb_id method. - query = db.query(Contributor).filter_by(gh_user_id=contributor["gh_user_id"]) - contributor_table_data = execute_session_query(query, 'all') + contributor_table_data = get_contributors_by_github_user_id(contributor["gh_user_id"]) # self.logger.info(f"Contributor query: {contributor_table_data}") # Handle potential failures @@ -175,9 +172,9 @@ def insert_alias(logger,db, contributor, email): logger.info( f"There are more than one contributors in the table with gh_user_id={contributor['gh_user_id']}") - #session.logger.info(f"Creating alias for email: {email}") + #logger.info(f"Creating alias for email: {email}") - #session.logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") + #logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") # Insert a new alias that corresponds to where the contributor was found # use the email of the new alias for canonical_email if the api returns NULL # TODO: It might be better to have the canonical_email allowed to be NUll because right now it has a null constraint. @@ -192,7 +189,7 @@ def insert_alias(logger,db, contributor, email): # Insert new alias - db.insert_data(alias, ContributorsAlias, ['alias_email']) + bulk_insert_dicts(logger, alias, ContributorsAlias, ['alias_email']) return @@ -200,7 +197,7 @@ def insert_alias(logger,db, contributor, email): # Takes the user data from the endpoint as arg # Updates the alias table if the login is already in the contributor's table with the new email. 
# Returns whether the login was found in the contributors table -def resolve_if_login_existing(session, contributor): +def resolve_if_login_existing(logger, contributor): # check if login exists in contributors table select_cntrbs_query = s.sql.text(""" SELECT cntrb_id from contributors @@ -210,7 +207,7 @@ def resolve_if_login_existing(session, contributor): # Bind parameter select_cntrbs_query = select_cntrbs_query.bindparams( gh_login_value=contributor['cntrb_login']) - result = session.execute_sql(select_cntrbs_query) + result = execute_sql(select_cntrbs_query) # if yes if len(result.fetchall()) >= 1: @@ -218,7 +215,7 @@ def resolve_if_login_existing(session, contributor): return True # If not found, return false - session.logger.info( + logger.info( f"Contributor not found in contributors table but can be added. Adding...") return False """ @@ -276,7 +273,7 @@ def fetch_username_from_email(logger, auth, commit): # Default to failed state login_json = None - #session.logger.info(f"Here is the commit: {commit}") + #logger.info(f"Here is the commit: {commit}") # email = commit['email_raw'] if 'email_raw' in commit else commit['email_raw'] @@ -311,7 +308,7 @@ def fetch_username_from_email(logger, auth, commit): # Method to return the login given commit data using the supplemental data in the commit # -email # -name -def get_login_with_supplemental_data(logger,db,auth, commit_data): +def get_login_with_supplemental_data(logger, auth, commit_data): # Try to get login from all possible emails # Is None upon failure. @@ -329,7 +326,7 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): try: unresolved_natural_keys = ['email'] - db.insert_data(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) except Exception as e: logger.error( f"Could not create new unresolved email {unresolved['email']}. Error: {e}") @@ -372,11 +369,11 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): return match['login'] -def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): +def get_login_with_commit_hash(logger, auth, commit_data, repo_id): # Get endpoint for login from hash url = create_endpoint_from_commit_sha( - logger,db,commit_data['hash'], repo_id) + logger, commit_data['hash'], repo_id) #TODO: here. 
# Send api request @@ -392,23 +389,3 @@ def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): match = None return match - - - -def create_endpoint_from_repo_id(logger,db, repo_id): - - """ - SELECT repo_git from repo - WHERE repo_id = :repo_id_bind - """ - #ORM syntax of above statement - query = db.session.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') - - url = result.repo_git - logger.info(f"Url: {url}") - - return url - - - diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 10f4affc6..d8a35ca58 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -4,20 +4,24 @@ from augur.tasks.github.util.github_paginator import * from augur.application.db.models import * from augur.tasks.util.AugurUUID import GithubUUID +from augur.application.db.lib import bulk_insert_dicts -def query_github_contributors(manifest, github_url): +def query_github_contributors(logger, key_auth, github_url): """ Data collection function Query the GitHub API for contributors """ + # Set platform id to 1 since it is a github method + platform_id = 1 + # Extract owner/repo from the url for the endpoint try: owner, name = get_owner_repo(github_url) except IndexError as e: - manifest.logger.error(f"Encountered bad url: {github_url}") + logger.error(f"Encountered bad url: {github_url}") raise e # Set the base of the url and place to hold contributors to insert @@ -35,11 +39,11 @@ def query_github_contributors(manifest, github_url): duplicate_col_map = {'cntrb_login': 'login'} #list to hold contributors needing insertion or update - contributor_list = GithubPaginator(contributors_url, manifest.key_auth,manifest.logger)#paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) + contributor_list = GithubPaginator(contributors_url, key_auth, logger)#paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) len_contributor_list = len(contributor_list) - manifest.logger.info("Count of contributors needing insertion: " + str(len_contributor_list) + "\n") + logger.info("Count of contributors needing insertion: " + str(len_contributor_list) + "\n") if len_contributor_list == 0: return @@ -52,13 +56,13 @@ def query_github_contributors(manifest, github_url): cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - manifest.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - #r = hit_api(session.oauths, cntrb_url, session.logger) + logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + #r = hit_api(session.oauths, cntrb_url, logger) #contributor = r.json() - contributor, result = retrieve_dict_from_endpoint(manifest.logger,manifest.key_auth, cntrb_url) + contributor, result = retrieve_dict_from_endpoint(logger, key_auth, cntrb_url) - #manifest.logger.info(f"Contributor: {contributor} \n") + #logger.info(f"Contributor: {contributor} \n") company = None location = None email = None @@ -76,7 +80,7 @@ def query_github_contributors(manifest, github_url): #cntrb_id = AugurUUID(session.platform_id,contributor['id']).to_UUID() cntrb_id = GithubUUID() cntrb_id["user"] = int(contributor['id']) - cntrb_id["platform"] = manifest.platform_id + cntrb_id["platform"] = platform_id cntrb = { "cntrb_id" : cntrb_id.to_UUID(), @@ -115,20 +119,17 @@ def query_github_contributors(manifest, github_url): cntrb_natural_keys = ['cntrb_id'] #insert cntrb to table. 
#session.logger.info(f"Contributor: {cntrb} \n") - manifest.augur_db.insert_data(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) except Exception as e: - manifest.logger.error("Caught exception: {}".format(e)) - manifest.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + logger.error("Caught exception: {}".format(e)) + logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) raise e # Get all the committer data for a repo. # Used by facade in facade03analyzecommit -def grab_committer_list(manifest, repo_id, platform="github"): +def grab_committer_list(logger, key_auth, repo_git, platform="github"): # Create API endpoint from repo_id - - endpoint = create_endpoint_from_repo_id(manifest.logger,manifest.augur_db, repo_id) - - query_github_contributors(manifest,endpoint) + query_github_contributors(logger, key_auth, repo_git) \ No newline at end of file diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 6bf9888c0..4a3806d50 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -4,14 +4,14 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask from augur.tasks.github.util.github_paginator import retrieve_dict_from_endpoint -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * -from augur.application.db.util import execute_session_query +from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * -def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id): +def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id): for contributor in contributorQueue: # Get the email from the commit data @@ -22,8 +22,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) # check the email to see if it already exists in contributor_aliases # Look up email to see if resolved - query = db.query(ContributorsAlias).filter_by(alias_email=email) - alias_table_data = execute_session_query(query, 'all') + alias_table_data = get_contributor_aliases_by_email(email) if len(alias_table_data) >= 1: # Move on if email resolved logger.info( @@ -34,8 +33,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Check the unresolved_commits table to avoid hitting endpoints that we know don't have relevant data needlessly - query = db.query(UnresolvedCommitEmail).filter_by(name=name) - unresolved_query_result = execute_session_query(query, 'all') + unresolved_query_result = get_unresolved_commit_emails_by_name(name) if len(unresolved_query_result) >= 1: @@ -46,8 +44,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Check the contributors table for a login for the given name - query = db.query(Contributor).filter_by(cntrb_full_name=name) - contributors_with_matching_name = execute_session_query(query, 'first') + contributors_with_matching_name = 
get_contributors_by_full_name(name) if not contributors_with_matching_name: logger.debug("Failed local login lookup") @@ -57,12 +54,12 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) # Try to get the login from the commit sha if login == None or login == "": - login = get_login_with_commit_hash(logger,db,auth,contributor, repo_id) + login = get_login_with_commit_hash(logger, auth, contributor, repo_id) if login == None or login == "": logger.info("Failed to get login from commit hash") # Try to get the login from supplemental data if not found with the commit hash - login = get_login_with_supplemental_data(logger, db, auth,contributor) + login = get_login_with_supplemental_data(logger, auth,contributor) if login == None or login == "": logger.error("Failed to get login from supplemental data!") @@ -130,12 +127,11 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Executes an upsert with sqlalchemy cntrb_natural_keys = ['cntrb_id'] - db.insert_data(cntrb,Contributor,cntrb_natural_keys) - + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey - insert_alias(logger, db,cntrb, emailFromCommitData) + insert_alias(logger, cntrb, emailFromCommitData) except LookupError as e: logger.info( ''.join(traceback.format_exception(None, e, e.__traceback__))) @@ -159,7 +155,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) logger.info(f"Updating now resolved email {email}") try: - db.execute_sql(query) + execute_sql(query) except Exception as e: logger.info( f"Deleting now resolved email failed with error: {e}") @@ -169,11 +165,11 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) return -def link_commits_to_contributor(session,contributorQueue): +def link_commits_to_contributor(logger, facade_helper, contributorQueue): # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. for cntrb in contributorQueue: - session.logger.debug( + logger.debug( f"These are the emails and cntrb_id's returned: {cntrb}") query = s.sql.text(""" @@ -186,7 +182,7 @@ def link_commits_to_contributor(session,contributorQueue): """).bindparams(cntrb_id=cntrb["cntrb_id"],cntrb_email=cntrb["email"]) #engine.execute(query, **data) - session.insert_or_update_data(query) + facade_helper.insert_or_update_data(query) return @@ -196,121 +192,114 @@ def link_commits_to_contributor(session,contributorQueue): @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) def insert_facade_contributors(self, repo_git): + # Set platform id to 1 since this task is github specific + platform_id = 1 + engine = self.app.engine logger = logging.getLogger(insert_facade_contributors.__name__) repo_id = None - with GithubTaskManifest(logger) as manifest: - - #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - query = s.sql.text("""SELECT repo_id FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - - repo = manifest.augur_db.execute_sql(query).fetchone() - logger.info(f"repo: {repo}") - repo_id = repo[0] - - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. 
- - manifest.logger.info( - "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) - new_contrib_sql = s.sql.text(""" - SELECT DISTINCT - commits.cmt_author_name AS NAME, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id) - - #Execute statement with session. - result = manifest.augur_db.execute_sql(new_contrib_sql) - new_contribs = [dict(row) for row in result.mappings()] - - #print(new_contribs) - - #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) - - - - process_commit_metadata(manifest.logger,manifest.augur_db,manifest.key_auth,list(new_contribs),repo_id,manifest.platform_id) - - manifest.logger.debug("DEBUG: Got through the new_contribs") - + # Get all of the commit data's emails and names from the commit table that do not appear + # in the contributors table or the contributors_aliases table. - with FacadeSession(logger) as session: - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! 
- resolve_email_to_cntrb_id_sql = s.sql.text(""" + logger.info( + "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) + new_contrib_sql = s.sql.text(""" SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email + commits.cmt_author_name AS NAME, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'not_unresolved' as resolution_status FROM - contributors, commits WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND commits.repo_id = :repo_id + commits.repo_id = :repo_id + AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) + or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email UNION SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email + commits.cmt_author_name AS NAME,--commits.cmt_id AS id, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'unresolved' as resolution_status FROM - contributors, - contributors_aliases, commits WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - """).bindparams(repo_id=repo_id) + commits.repo_id = :repo_id + AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email + ORDER BY + hash + """).bindparams(repo_id=repo_id) + + #Execute statement with session. + result = execute_sql(new_contrib_sql) + new_contribs = [dict(row) for row in result.mappings()] - #self.logger.info("DEBUG: got passed the sql statement declaration") - # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. - #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) + #print(new_contribs) - result = session.execute_sql(resolve_email_to_cntrb_id_sql) - existing_cntrb_emails = [dict(row) for row in result.mappings()] + #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) - print(existing_cntrb_emails) - link_commits_to_contributor(session,list(existing_cntrb_emails)) - session.logger.info("Done with inserting and updating facade contributors") + key_auth = GithubRandomKeyAuth(logger) + + process_commit_metadata(logger, key_auth, list(new_contribs), repo_id, platform_id) + + logger.debug("DEBUG: Got through the new_contribs") + + facade_helper = FacadeHelper(logger) + # sql query used to find corresponding cntrb_id's of emails found in the contributor's table + # i.e., if a contributor already exists, we use it! 
+ resolve_email_to_cntrb_id_sql = s.sql.text(""" + SELECT DISTINCT + cntrb_id, + contributors.cntrb_login AS login, + contributors.cntrb_canonical AS email, + commits.cmt_author_raw_email + FROM + contributors, + commits + WHERE + contributors.cntrb_canonical = commits.cmt_author_raw_email + AND commits.repo_id = :repo_id + UNION + SELECT DISTINCT + contributors_aliases.cntrb_id, + contributors.cntrb_login as login, + contributors_aliases.alias_email AS email, + commits.cmt_author_raw_email + FROM + contributors, + contributors_aliases, + commits + WHERE + contributors_aliases.alias_email = commits.cmt_author_raw_email + AND contributors.cntrb_id = contributors_aliases.cntrb_id + AND commits.repo_id = :repo_id + """).bindparams(repo_id=repo_id) + + #self.logger.info("DEBUG: got passed the sql statement declaration") + # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. + #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) + + result = execute_sql(resolve_email_to_cntrb_id_sql) + existing_cntrb_emails = [dict(row) for row in result.mappings()] + + print(existing_cntrb_emails) + link_commits_to_contributor(logger, facade_helper,list(existing_cntrb_emails)) + + logger.info("Done with inserting and updating facade contributors") return diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index baccfdc60..98a8067eb 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -8,59 +8,43 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor, Repo +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor from augur.application.config import get_development_flag -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts + development = get_development_flag() @celery.task(base=AugurCoreRepoCollectionTask) def collect_issues(repo_git : str) -> int: - logger = logging.getLogger(collect_issues.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - logger.info(f'this is the manifest.key_auth value: {str(manifest.key_auth)}') + owner, repo = get_owner_repo(repo_git) - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id - - #try this - # the_key = manifest.key_auth - # try: - # randomon = GithubApiKeyHandler(augur_db.session) - # the_key = randomon.get_random_key() - # logger.info(f'The Random Key {the_key}') - # except Exception as e: - # logger.info(f'error: {e}') - # the_key = manifest.key_auth - # pass - - owner, repo = get_owner_repo(repo_git) - - issue_data = retrieve_all_issue_data(repo_git, logger, manifest.key_auth) - #issue_data = retrieve_all_issue_data(repo_git, 
logger, the_key) - - if issue_data: - total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, augur_db) - - return total_issues - else: - logger.info(f"{owner}/{repo} has no issues") - return 0 - except Exception as e: - logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") - return -1 + key_auth = GithubRandomKeyAuth(logger) + + logger.info(f'this is the manifest.key_auth value: {str(key_auth)}') + + try: + issue_data = retrieve_all_issue_data(repo_git, logger, key_auth) + + if issue_data: + total_issues = len(issue_data) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) + + return total_issues + else: + logger.info(f"{owner}/{repo} has no issues") + return 0 + except Exception as e: + logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 @@ -99,7 +83,7 @@ def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: return all_data -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: # get repo_id or have it passed tool_source = "Issue Task" @@ -153,7 +137,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) # insert the issues into the issues table. @@ -164,7 +148,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_return_columns = ["issue_url", "issue_id"] issue_string_columns = ["issue_title", "issue_body"] try: - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) except IntegrityError as e: logger.error(f"Ran into integrity error:{e} \n Offending data: \n{issue_dicts}") @@ -197,13 +181,13 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index f3a30a54f..3e104fc6d 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -287,4 +287,4 @@ def process_github_comment_contributors(message, tool_source, tool_version, data # This is done by searching all the dicts for the given key that has the specified value def find_dict_in_list_of_dicts(data, key, value): - return next((item for item in data if item[key] == value), None) + return next((item for item in data if item[key] == value), None) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index ea91a597d..7c6f36abf 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -2,25 +2,20 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts, fetchall_data_from_sql_text - -def pull_request_commits_model(repo_id,logger, augur_db, key_auth): +def pull_request_commits_model(repo,logger, key_auth): # query existing PRs and the respective url we will append the commits url to pr_url_sql = s.sql.text(""" SELECT DISTINCT pr_url, pull_requests.pull_request_id FROM pull_requests--, pull_request_meta WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) + """).bindparams(repo_id=repo.repo_id) pr_urls = [] #pd.read_sql(pr_number_sql, self.db, params={}) - pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() - - query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') - + pr_urls = fetchall_data_from_sql_text(pr_url_sql) owner, name = get_owner_repo(repo.repo_git) task_name = f"{owner}/{name} Pr commits" @@ -49,7 +44,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth): 'tool_source': 'pull_request_commits_model', 'tool_version': '0.41', 'data_source': 'GitHub API', - 'repo_id': repo_id, + 'repo_id': repo.repo_id, } all_data.append(pr_commit_row) @@ -57,7 +52,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth): if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index f0a065bdd..b1d920e98 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -2,8 +2,9 @@ from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query -from 
augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.lib import get_repo_by_repo_git + @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -11,11 +12,8 @@ def process_pull_request_commits(repo_git: str) -> None: logger = logging.getLogger(process_pull_request_commits.__name__) - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db + repo = get_repo_by_repo_git(repo_git) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') + key_auth = GithubRandomKeyAuth(logger) - pull_request_commits_model(repo.repo_id, logger, augur_db, manifest.key_auth) + pull_request_commits_model(repo, logger, key_auth) diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 5bc86cd67..38d1136eb 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -4,6 +4,7 @@ from augur.application.db.data_parse import * from augur.application.db.session import DatabaseSession +from augur.application.db.lib import bulk_insert_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor @@ -129,12 +130,12 @@ def extract_data_from_pr_list(pull_requests: List[dict], return pr_dicts, pr_mapping_data, pr_numbers, contributors -def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, task_name: str) -> None: +def insert_pr_contributors(contributors: List[dict], logger, task_name: str) -> None: """Insert pr contributors Args: contributors: the contributor data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type """ @@ -142,16 +143,16 @@ def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, t contributors = remove_duplicate_dicts(contributors) # insert contributors from these prs - session.logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - session.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) -def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) -> Optional[List[dict]]: +def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[dict]]: """Insert pull requests Args: pr_dicts: the pull request data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type Returns: @@ -159,10 +160,10 @@ def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) - So we can determine what labels, assigness, and other data belong to each pr """ - session.logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") + logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") pr_natural_keys = ["pr_url"] pr_return_columns = ["pull_request_id", "pr_url"] - pr_return_data = session.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) + pr_return_data = 
bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns)

     return pr_return_data


@@ -211,7 +212,7 @@ def map_other_pr_data_to_pr(

     return pr_label_dicts, pr_assignee_dicts, pr_reviewer_dicts, pr_metadata_dicts


-def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> None:
+def insert_pr_labels(labels: List[dict], logger: logging.Logger) -> None:
     """Insert pull request labels

     Note:
@@ -223,10 +224,10 @@ def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> Non
     """
     # we are using pr_src_id and pull_request_id to determine if the label is already in the database.
     pr_label_natural_keys = ['pr_src_id', 'pull_request_id']
-    session.insert_data(labels, PullRequestLabel, pr_label_natural_keys)
+    bulk_insert_dicts(logger, labels, PullRequestLabel, pr_label_natural_keys)


-def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session) -> None:
+def insert_pr_assignees(assignees: List[dict], logger: logging.Logger) -> None:
     """Insert pull request assignees

     Note:
@@ -238,10 +239,10 @@ def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session)
     """
     # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database.
     pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id']
-    session.insert_data(assignees, PullRequestAssignee, pr_assignee_natural_keys)
+    bulk_insert_dicts(logger, assignees, PullRequestAssignee, pr_assignee_natural_keys)


-def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session) -> None:
+def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger) -> None:
     """Insert pull request reviewers

     Note:
@@ -253,10 +254,10 @@ def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session)
     """
     # we are using pr_src_id and pull_request_id to determine if the label is already in the database.
     pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"]
-    session.insert_data(reviewers, PullRequestReviewer, pr_reviewer_natural_keys)
+    bulk_insert_dicts(logger, reviewers, PullRequestReviewer, pr_reviewer_natural_keys)


-def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) -> None:
+def insert_pr_metadata(metadata: List[dict], logger: logging.Logger) -> None:
     """Insert pull request metadata

     Note:
@@ -269,7 +270,7 @@ def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) ->
     # inserting pr metadata
     # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha']
-    session.insert_data(metadata, PullRequestMeta, pr_metadata_natural_keys)
+    bulk_insert_dicts(logger, metadata, PullRequestMeta, pr_metadata_natural_keys)
diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py
index 138aa61cb..537d2bd20 100644
--- a/augur/tasks/github/pull_requests/files_model/core.py
+++ b/augur/tasks/github/pull_requests/files_model/core.py
@@ -2,7 +2,10 @@
 from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection
 from augur.application.db.models import *
 from augur.tasks.github.util.util import get_owner_repo
+from augur.application.db.lib import bulk_insert_dicts, execute_sql
 from augur.application.db.util import execute_session_query
+import traceback
+import httpx # Import httpx


 def pull_request_files_model(repo_id,logger, augur_db, key_auth):
@@ -14,28 +17,24 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth):
     """).bindparams(repo_id=repo_id)

     pr_numbers = []
     #pd.read_sql(pr_number_sql, self.db, params={})
-
-    result = augur_db.execute_sql(pr_number_sql)#.fetchall()
-    pr_numbers = [dict(row) for row in result.mappings()]
-
+    result = execute_sql(pr_number_sql)
+    pr_numbers = [dict(row) for row in result.mappings()]
+
     query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id)
     repo = execute_session_query(query, 'one')
-
     owner, name = get_owner_repo(repo.repo_git)

     pr_file_rows = []
     logger.info(f"Getting pull request files for repo: {repo.repo_git}")
-    for index,pr_info in enumerate(pr_numbers):
+    for index, pr_info in enumerate(pr_numbers):

         logger.info(f'Querying files for pull request #{index + 1} of {len(pr_numbers)}')

         query = """
-
             query($repo: String!, $owner: String!,$pr_number: Int!, $numRecords: Int!, $cursor: String) {
                 repository(name: $repo, owner: $owner) {
                     pullRequest(number: $pr_number) {
-                        files ( first: $numRecords, after: $cursor)
-                        {
+                        files ( first: $numRecords, after: $cursor) {
                             edges {
                                 node {
                                     additions
@@ -54,28 +51,31 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth):
             }
         """

-        values = ("repository","pullRequest","files")
+        values = ("repository", "pullRequest", "files")
         params = {
-            'owner' : owner,
-            'repo' : name,
-            'pr_number' : pr_info['pr_src_number'],
-            'values' : values
+            'owner': owner,
+            'repo': name,
+            'pr_number': pr_info['pr_src_number'],
+            'values': values
         }
-
-        file_collection = GraphQlPageCollection(query, key_auth, logger,bind=params)
+        try:
+            file_collection = GraphQlPageCollection(query, key_auth, logger, bind=params)

-        pr_file_rows += [{
-            'pull_request_id': pr_info['pull_request_id'],
-            'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None,
-            'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None,
-            'pr_file_path': pr_file['path'],
-            'data_source': 'GitHub API',
-            'repo_id': repo_id,
+            pr_file_rows += [{
+                'pull_request_id': pr_info['pull_request_id'],
+                'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None,
+                'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None,
+                'pr_file_path': pr_file['path'],
+                'data_source': 'GitHub API',
+                'repo_id': repo.repo_id,
            } for pr_file in file_collection if pr_file and 'path' in pr_file]
-
+        except httpx.RequestError as e:
+            logger.error(f"An error occurred while requesting data from the GitHub API: {e}")
+        except httpx.HTTPStatusError as e:
+            logger.error(f"HTTP error occurred while requesting data from the GitHub API: {e}")

     if len(pr_file_rows) > 0:
-        #Execute a bulk upsert with sqlalchemy 
+ # Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 73ea1b025..ace6fa7d2 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -5,25 +5,29 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo +from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + +import httpx from typing import Generator, List, Dict platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) with GithubTaskManifest(logger) as manifest: + #with GithubTaskManifest() as manifest: augur_db = manifest.augur_db @@ -82,7 +86,6 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Di yield page_data - def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): """ Parse and insert all retrieved PR data. 
@@ -188,6 +191,11 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): + + + + + @@ -206,7 +214,6 @@ def process_pull_request_review_contributor(pr_review: dict, tool_source: str, t return pr_review_cntrb - @celery.task(base=AugurSecondaryRepoCollectionTask) def collect_pull_request_review_comments(repo_git: str) -> None: @@ -217,112 +224,105 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger = logging.getLogger(collect_pull_request_review_comments.__name__) logger.info(f"Collecting pull request review comments for {owner}/{repo}") - # define GithubTaskSession to handle insertions, and store oauth keys - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - # get repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) - pr_reviews = execute_session_query(query, 'all') + pr_reviews = get_pull_request_reviews_by_repo_id(repo_id) - # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id - pr_review_id_mapping = {} - for review in pr_reviews: - pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id + # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id + pr_review_id_mapping = {} + for review in pr_reviews: + pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id - tool_source = "Pr review comment task" - tool_version = "2.0" - data_source = "Github API" - - pr_review_messages = GithubPaginator(review_msg_url, manifest.key_auth, logger) - num_pages = pr_review_messages.get_num_pages() + tool_source = "Pr review comment task" + tool_version = "2.0" + data_source = "Github API" - all_raw_pr_review_messages = [] - for page_data, page in pr_review_messages.iter_pages(): + key_auth = GithubRandomKeyAuth(logger) + pr_review_messages = GithubPaginator(review_msg_url, key_auth, logger) + num_pages = pr_review_messages.get_num_pages() - if page_data is None: - break + all_raw_pr_review_messages = [] + for page_data, page in pr_review_messages.iter_pages(): - if len(page_data) == 0: - logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - break + if page_data is None: + break + if len(page_data) == 0: + logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") + break - all_raw_pr_review_messages += page_data + logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - contributors = [] - for comment in all_raw_pr_review_messages: - - _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) - if contributor is not None: - contributors.append(contributor) + all_raw_pr_review_messages += page_data + + contributors = [] + for comment in all_raw_pr_review_messages: + + _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) + if contributor is not None: + contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + 
logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) - pr_review_comment_dicts = [] - pr_review_msg_mapping_data = {} + pr_review_comment_dicts = [] + pr_review_msg_mapping_data = {} - pr_review_comments_len = len(all_raw_pr_review_messages) - logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") - for index, comment in enumerate(all_raw_pr_review_messages): + pr_review_comments_len = len(all_raw_pr_review_messages) + logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") + for index, comment in enumerate(all_raw_pr_review_messages): - # pull_request_review_id is required to map it to the correct pr review - if not comment["pull_request_review_id"]: - continue + # pull_request_review_id is required to map it to the correct pr review + if not comment["pull_request_review_id"]: + continue - pr_review_comment_dicts.append( - extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) - ) + pr_review_comment_dicts.append( + extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) + ) - # map github message id to the data that maps it to the pr review - github_msg_id = comment["id"] - pr_review_msg_mapping_data[github_msg_id] = comment + # map github message id to the data that maps it to the pr review + github_msg_id = comment["id"] + pr_review_msg_mapping_data[github_msg_id] = comment - logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") - message_natural_keys = ["platform_msg_id", "pltfrm_id"] - message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = augur_db.insert_data(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) - if message_return_data is None: - return + logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") + message_natural_keys = ["platform_msg_id", "pltfrm_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + if message_return_data is None: + return - pr_review_message_ref_insert_data = [] - for data in message_return_data: + pr_review_message_ref_insert_data = [] + for data in message_return_data: - augur_msg_id = data["msg_id"] - github_msg_id = data["platform_msg_id"] + augur_msg_id = data["msg_id"] + github_msg_id = data["platform_msg_id"] - comment = pr_review_msg_mapping_data[github_msg_id] - comment["msg_id"] = augur_msg_id + comment = pr_review_msg_mapping_data[github_msg_id] + comment["msg_id"] = augur_msg_id - github_pr_review_id = comment["pull_request_review_id"] + github_pr_review_id = comment["pull_request_review_id"] - try: - augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] - except KeyError: - logger.info(f"{owner}/{repo}: Could not find related pr review") - logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") - logger.info("Skipping") - continue + try: + augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] + except KeyError: + logger.info(f"{owner}/{repo}: Could not find related pr review") + logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") + logger.info("Skipping") + continue - pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, 
github_pr_review_id, repo_id, tool_version, data_source) - pr_review_message_ref_insert_data.append(pr_review_message_ref) + pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) + pr_review_message_ref_insert_data.append(pr_review_message_ref) - logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") - pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - augur_db.insert_data(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") + pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) @@ -338,72 +338,82 @@ def collect_pull_request_reviews(repo_git: str) -> None: tool_source = "pull_request_reviews" data_source = "Github API" - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + key_auth = GithubRandomKeyAuth(logger) - query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + with get_session() as session: + + query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') - pr_count = len(prs) + pr_count = len(prs) - all_pr_reviews = {} - for index, pr in enumerate(prs): + all_pr_reviews = {} + for index, pr in enumerate(prs): - pr_number = pr.pr_src_number - pull_request_id = pr.pull_request_id + pr_number = pr.pr_src_number + pull_request_id = pr.pull_request_id - logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") + logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") - pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" + pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" - pr_reviews = [] - pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) - for page_data, page in pr_reviews_generator.iter_pages(): + pr_reviews = [] + pr_reviews_generator = GithubPaginator(pr_review_url, key_auth, logger) + for page_data, page in pr_reviews_generator.iter_pages(): - if page_data is None: - break + if page_data is None: + break - if len(page_data) == 0: - break + if len(page_data) == 0: + break - pr_reviews.extend(page_data) + if isinstance(page_data, list): + page_data = [ + element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element + for element in page_data + ] + logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") + elif isinstance(page_data, bytes): + page_data = page_data.decode('utf-8').replace('\x00', ' ') + logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") + - if pr_reviews: - all_pr_reviews[pull_request_id] = pr_reviews + pr_reviews.extend(page_data) + + if pr_reviews: + all_pr_reviews[pull_request_id] = pr_reviews - if not list(all_pr_reviews.keys()): - logger.info(f"{owner}/{repo} No pr reviews for repo") - return + if not list(all_pr_reviews.keys()): + logger.info(f"{owner}/{repo} No pr reviews for repo") + return - 
contributors = [] - for pull_request_id in all_pr_reviews.keys(): + contributors = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) - if contributor: - contributors.append(contributor) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) + if contributor: + contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) - pr_reviews = [] - for pull_request_id in all_pr_reviews.keys(): + pr_reviews = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - - if "cntrb_id" in review: - pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + + if "cntrb_id" in review: + pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) - logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") - pr_review_natural_keys = ["pr_review_src_id",] - augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) + logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") + pr_review_natural_keys = ["pr_review_src_id",] + bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index b7f953c61..3192401ae 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -4,6 +4,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts def get_release_inf(repo_id, release, tag_only): @@ -63,11 +64,11 @@ def get_release_inf(repo_id, release, tag_only): return release_inf -def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): +def insert_release(session, logger, repo_id, owner, release, tag_only = False): # Get current table values logger.info('Getting release table values\n') - query = augur_db.session.query(Release.release_id).filter(Release.repo_id == repo_id) + query = session.query(Release.release_id).filter(Release.repo_id == repo_id) release_id_data = execute_session_query(query, 'all')#pd.read_sql(release_id_data_sql, self.db, params={'repo_id': repo_id}) release_id_data = [str(r_id).strip() for r_id in release_id_data]#release_id_data.apply(lambda x: x.str.strip()) @@ -77,7 +78,7 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): #Do an upsert string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] - augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) + bulk_insert_dicts(logger, release_inf,Release,['release_id'], string_fields=string_fields) 
logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") @@ -166,7 +167,7 @@ def fetch_data(key_auth, logger, github_url, repo_id, tag_only = False): return data -def releases_model(augur_db, key_auth, logger, repo_git, repo_id): +def releases_model(session, key_auth, logger, repo_git, repo_id): try: data = fetch_data(key_auth, logger, repo_git, repo_id) @@ -181,7 +182,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release) - insert_release(augur_db, logger, repo_id, data['owner'], release) + insert_release(session, logger, repo_id, data['owner'], release) else: logger.info("There's no release to insert. Current node is not available in releases: {}\n".format(n)) elif 'edges' in data['releases'] and not data['releases']['edges']: @@ -194,7 +195,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release, True) - insert_release(augur_db,logger, repo_id, data['owner'], release, True) + insert_release(session, logger, repo_id, data['owner'], release, True) else: logger.info("There's no release to insert. Current node is not available in releases: {}\n".format(n)) else: diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 310da90d7..3e2210a7c 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -1,22 +1,22 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git, get_session +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task(base=AugurCoreRepoCollectionTask) def collect_releases(repo_git): logger = logging.getLogger(collect_releases.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GithubRandomKeyAuth(logger) + with get_session() as session: - releases_model(augur_db, manifest.key_auth, logger, repo_git, repo_id) \ No newline at end of file + releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 2a9f21af7..0cf6705fc 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -6,6 +6,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.models import * +from augur.application.db.lib import execute_sql from augur.tasks.github.util.github_task_session import * from augur.application.db.models.augur_data import RepoBadging from urllib.parse import quote @@ -92,7 +93,7 @@ def grab_repo_info_from_graphql_endpoint(key_auth, logger, query): return data -def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): +def repo_info_model(key_auth, repo_orm_obj, logger): 
logger.info("Beginning filling the repo_info model for repo: " + repo_orm_obj.repo_git + "\n") owner, repo = get_owner_repo(repo_orm_obj.repo_git) @@ -255,7 +256,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): 'data_source': "Github" } - #result = session.insert_data(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) + #result = bulk_insert_dicts(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) insert_statement = s.sql.text("""INSERT INTO repo_info (repo_id,last_updated,issues_enabled, open_issues,pull_requests_enabled,wiki_enabled,pages_enabled,fork_count, default_branch,watchers_count,license,stars_count, @@ -270,7 +271,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): :tool_source, :tool_version, :data_source) """).bindparams(**rep_inf) - augur_db.execute_sql(insert_statement) + execute_sql(insert_statement) # Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table. forked = is_forked(key_auth, logger, owner, repo) @@ -283,7 +284,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): archived = 0 update_repo_data = s.sql.text("""UPDATE repo SET forked_from=:forked, repo_archived=:archived, repo_archived_date_collected=:archived_date_collected WHERE repo_id=:repo_id""").bindparams(forked=forked, archived=archived, archived_date_collected=archived_date_collected, repo_id=repo_orm_obj.repo_id) - augur_db.execute_sql(update_repo_data) + execute_sql(update_repo_data) logger.info(f"Inserted info for {owner}/{repo}\n") diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index b31bc7bf6..85d639d2a 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -1,10 +1,12 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.session import DatabaseSession from augur.tasks.github.repo_info.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db import get_engine #Task to get regular misc github info @@ -13,23 +15,23 @@ def collect_repo_info(repo_git: str): logger = logging.getLogger(collect_repo_info.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - repo_info_model(augur_db, manifest.key_auth, repo, logger) + repo = get_repo_by_repo_git(repo_git) + + key_auth = GithubRandomKeyAuth(logger) + + repo_info_model(key_auth, repo, logger) #Task to get CII api data for linux badge info using github data. 
@celery.task(base=AugurCoreRepoCollectionTask) def collect_linux_badge_info(repo_git: str): + engine = get_engine() + logger = logging.getLogger(collect_linux_badge_info.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') + repo = get_repo_by_repo_git(repo_git) + + with DatabaseSession(logger, engine=engine) as session: - badges_model(logger, repo_git, repo.repo_id, augur_db) + badges_model(logger, repo_git, repo.repo_id, session) diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 068c9616b..4101faa3f 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -3,62 +3,61 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.data_parse import extract_needed_clone_history_data from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import RepoClone, Repo -from augur.application.db.util import execute_session_query +from augur.application.db.models import RepoClone +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task def collect_github_repo_clones_data(repo_git: str) -> None: logger = logging.getLogger(collect_github_repo_clones_data.__name__) + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - # using GithubTaskSession to get our repo_obj for which we will store data of clones - with GithubTaskManifest(logger) as manifest: + owner, repo = get_owner_repo(repo_git) - query = manifest.augur_db.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - owner, repo = get_owner_repo(repo_git) + key_auth = GithubRandomKeyAuth(logger) - logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - - clones_data = retrieve_all_clones_data(repo_git, logger, manifest.key_auth) + clones_data = retrieve_all_clones_data(repo_git, logger, key_auth) - if clones_data: - process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id, manifest.augur_db) - else: - logger.info(f"{owner}/{repo} has no clones") + if clones_data: + process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id) + else: + logger.info(f"{owner}/{repo} has no clones") def retrieve_all_clones_data(repo_git: str, logger, key_auth): - owner, repo = get_owner_repo(repo_git) + # owner, repo = get_owner_repo(repo_git) - url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" + # url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" - clones = GithubPaginator(url, key_auth, logger) + # clones = GithubPaginator(url, key_auth, logger) - num_pages = clones.get_num_pages() + # num_pages = clones.get_num_pages() all_data = [] - for page_data, page in clones.iter_pages(): + # for page_data, page in clones.iter_pages(): - if page_data is None: - return all_data + # if page_data is None: + # return all_data - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no 
data...returning") - logger.info(f"Traffic Page {page} of {num_pages}") - return all_data + # elif len(page_data) == 0: + # logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") + # logger.info(f"Traffic Page {page} of {num_pages}") + # return all_data - logger.info(f"{repo} Traffic Page {page} of {num_pages}") + # logger.info(f"{repo} Traffic Page {page} of {num_pages}") - all_data += page_data + # all_data += page_data return all_data -def process_clones_data(clones_data, task_name, repo_id, logger, augur_db) -> None: +def process_clones_data(clones_data, task_name, repo_id, logger) -> None: clone_history_data = clones_data[0]['clones'] clone_history_data_dicts = extract_needed_clone_history_data(clone_history_data, repo_id) @@ -66,4 +65,4 @@ def process_clones_data(clones_data, task_name, repo_id, logger, augur_db) -> No clone_history_data = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history records") - augur_db.insert_data(clone_history_data_dicts, RepoClone, ['repo_id']) + bulk_insert_dicts(logger, clone_history_data_dicts, RepoClone, ['repo_id']) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 574adbbaf..0667ab331 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -250,9 +250,11 @@ def hit_api(self,query,variables={}): def extract_paginate_result(self,responseDict): if not responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") raise TimeoutError("No data received from endpoint.") #err = process_graphql_dict_response(self.logger, responseObject, response) if 'data' not in responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") self.logger.error(responseDict) raise KeyError @@ -293,6 +295,8 @@ def __getitem__(self, index):# -> dict: #extract the content from the graphql query result coreData = self.extract_paginate_result(data) + self.logger.debug(f"for page in range 298: {data}") + content = [data['node'] for data in list(coreData['edges'])] if self.repaginate: @@ -323,6 +327,8 @@ def __len__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"__len__: debug: {data}") + totalCount = int(coreData['totalCount']) return totalCount @@ -342,7 +348,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) if coreData is not None: if coreData.get('totalCount') is not None: - self.logger.info("... core data obtained") + self.logger.info("debug-gog: ... 
core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 @@ -384,6 +390,7 @@ def __iter__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"while core data: {data}") #print(coreData) if len(coreData['edges']) == 0: diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index d87d7495e..4f8178e7c 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -6,8 +6,7 @@ from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, get_worker_oauth_keys from sqlalchemy import func @@ -19,7 +18,6 @@ class GithubApiKeyHandler(): """Handles Github API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the github api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -27,9 +25,8 @@ class GithubApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "github_oauth_keys_list" @@ -69,16 +66,12 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models import WorkerOauth - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'github'] + keys = get_worker_oauth_keys('github') - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + filtered_keys = [item for item in keys if item != self.config_key] + return filtered_keys def get_api_keys(self) -> List[str]: """Retrieves all valid Github API Keys diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 1818aef31..90593cedf 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -303,8 +303,7 @@ def __iter__(self) -> Generator[Optional[dict], None, None]: return # yield the first page data - for data in data_list: - yield data + yield from data_list while 'next' in response.links.keys(): next_page = response.links['next']['url'] @@ -315,9 +314,8 @@ def __iter__(self) -> Generator[Optional[dict], None, None]: if result != GithubApiResult.SUCCESS: self.logger.debug("Failed to retrieve the data even though 10 attempts were given") return - - for data in data_list: - yield data + + yield from data_list def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: """Provide data from Github API via a generator that yields a page of dicts at a time. @@ -389,9 +387,37 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. 
if response.status_code == 204: return [], response, GithubApiResult.SUCCESS + if response.status_code == 404: + return None, response, GithubApiResult.REPO_NOT_FOUND - page_data = parse_json_response(self.logger, response) + if response.status_code in [403, 429]: + + if "Retry-After" in response.headers: + retry_after = int(response.headers["Retry-After"]) + self.logger.info( + f'\n\n\n\nSleeping for {retry_after} seconds due to secondary rate limit issue.\n\n\n\n') + time.sleep(retry_after) + + elif "X-RateLimit-Remaining" in response.headers and int(response.headers["X-RateLimit-Remaining"]) == 0: + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + num_attempts = 0 + + else: + time.sleep(60) + + continue + + page_data = parse_json_response(self.logger, response) # if the data is a list, then return it and the response if isinstance(page_data, list) is True: @@ -401,6 +427,8 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. if isinstance(page_data, dict) is True: dict_processing_result = process_dict_response(self.logger, response, page_data) + self.logger.info(f"Used string interogation of dict to determine result. Response code: {response.status_code}. Processing result: {dict_processing_result}. Response body: {page_data}") + if dict_processing_result == GithubApiResult.NEW_RESULT: self.logger.info(f"Encountered new dict response from api on url: {url}. 
Response: {page_data}") return None, None, GithubApiResult.NEW_RESULT diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index ed539430d..1c7fc74e8 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -9,12 +9,12 @@ class GithubRandomKeyAuth(RandomKeyAuth): github collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(session, logger).keys + github_api_keys = GithubApiKeyHandler(logger).keys #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: @@ -24,4 +24,23 @@ def __init__(self, session: Session, logger): header_name = "Authorization" key_format = "token {0}" - super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file + super().__init__(github_api_keys, header_name, logger, key_format) + + # It needs to be this at some point, however not all the method calls are sending 3 arguments + + # def __init__(self, session: Session, logger): + # """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # # gets the github api keys from the database via the GithubApiKeyHandler + # github_api_keys = GithubApiKeyHandler(session, logger).keys + # #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) + + # if not github_api_keys: + # print("Failed to find github api keys. This is usually because your key has expired") + + # # defines the structure of the github api key + # header_name = "Authorization" + # key_format = "token {0}" + + # super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 0acbbf64c..2869643bd 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -11,7 +11,9 @@ def __init__(self, logger): engine = get_engine() self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #totalHack + self.key_auth = GithubRandomKeyAuth(logger) self.logger = logger self.platform_id = 1 @@ -35,10 +37,28 @@ class GithubTaskSession(DatabaseSession): platform_id (int): The id that refers to the Github platform """ + #def __init__(self, logger: Logger, engine=None): def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - - self.oauths = GithubRandomKeyAuth(self, logger) - self.platform_id = 1 + self.oauths = GithubRandomKeyAuth(logger) + #self.oauths = GithubRandomKeyAuth(self, logger) # Removed and replaced for the issue below in frontend.py + ''' + Debugging this issue: + Traceback (most recent call last): + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task + R = retval = fun(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ + return self.run(*args, **kwargs) + 
^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/augur/augur/tasks/frontend.py", line 24, in add_org_repo_list + with GithubTaskSession(logger) as session: + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/github/augur/augur/tasks/github/util/github_task_session.py", line 44, in __init__ + self.oauths = GithubRandomKeyAuth(self, logger) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + TypeError: GithubRandomKeyAuth.__init__() takes 2 positional arguments but 3 were given + ''' + self.platform_id = 1 \ No newline at end of file diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 5dfe10097..8dd6e4d81 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -3,9 +3,8 @@ import logging import json import httpx -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession -from augur.application.db.models import Repo +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -72,29 +71,27 @@ def get_repo_weight_by_issue(logger,repo_git): owner,name = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: - repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) - number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) + key_auth = GithubRandomKeyAuth(logger) + + repo_graphql = GitHubRepoGraphql(logger, key_auth, owner, name) + number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) return number_of_issues_and_prs #Get the weight for each repo for the core collection hook def get_repo_weight_core(logger,repo_git): - from augur.application.db import get_engine - engine = get_engine() - - with DatabaseSession(logger,engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - if not repo: - raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - - #try to get the collection status if it exists at this point - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) + repo = get_repo_by_repo_git(repo_git) + + if not repo: + raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") + + #try to get the collection status if it exists at this point + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) #Don't go below zero. 
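# Schematic composite of the refactor repeated across the task files above and below
# (releases, repo_info, traffic, and the GitLab events/issues/merge request tasks that
# follow): the GithubTaskManifest / GitlabTaskManifest wrapper is gone. Tasks now resolve
# the repo via get_repo_by_repo_git(), build the key auth from the logger alone, do the
# API retrieval, and only open a short-lived get_session() scope for the database writes.
# A sketch assuming the augur package context; collect_widget_data and its
# retrieve/process helpers are hypothetical stand-ins, not functions from this diff.
import logging

from augur.application.db.lib import get_repo_by_repo_git, get_session
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.init.celery_app import celery_app as celery
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask


def retrieve_widget_data(repo_git, logger, key_auth):
    # hypothetical stand-in for the per-task API retrieval helpers in the diff
    return []


def process_widget_data(data, repo_id, logger, session):
    # hypothetical stand-in; the real process_* helpers map source ids to augur ids
    # and then call bulk_insert_dicts(logger, dicts, Model, natural_keys)
    logger.info(f"Would insert {len(data)} rows for repo {repo_id}")


@celery.task(base=AugurCoreRepoCollectionTask)
def collect_widget_data(repo_git: str):
    logger = logging.getLogger(collect_widget_data.__name__)

    repo_id = get_repo_by_repo_git(repo_git).repo_id   # repo lookup no longer needs a session
    key_auth = GithubRandomKeyAuth(logger)              # key auth built from the logger alone

    data = retrieve_widget_data(repo_git, logger, key_auth)

    with get_session() as session:                      # DB work confined to this scope
        if data:
            process_widget_data(data, repo_id, logger, session)
        else:
            logger.info(f"{repo_git} has no widget data")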
diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index a7b886da2..c8d9a8f8a 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -6,11 +6,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent -from augur.application.db.util import execute_session_query +from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth + platform_id = 2 @@ -26,19 +27,18 @@ def collect_gitlab_issue_events(repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, key_auth) - events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + with get_session() as session: if events: logger.info(f"Length of gitlab issue events: {len(events)}") - process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue events") @@ -52,23 +52,21 @@ def collect_gitlab_merge_request_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, key_auth) - events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + with get_session() as session: if events: logger.info(f"Length of gitlab merge request events: {len(events)}") - process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request events") @@ -110,7 +108,7 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: return all_data -def process_issue_events(events, task_name, repo_id, logger, augur_db): +def process_issue_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ 
-119,7 +117,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab issue events task" @@ -130,7 +128,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): # create mapping from issue number to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id @@ -153,10 +151,10 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) -def process_mr_events(events, task_name, repo_id, logger, augur_db): +def process_mr_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr events from the api response @@ -180,7 +178,7 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -203,6 +201,6 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") mr_event_natural_keys = ["platform_id", "node_id"] - augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + bulk_insert_dicts(logger, mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py index c3a76f6dd..40b37d62c 100644 --- a/augur/tasks/gitlab/gitlab_api_key_handler.py +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -8,11 +8,9 @@ import random from typing import List -from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.lib import get_value -from sqlalchemy import func +from augur.application.db.lib import get_value, get_worker_oauth_keys class NoValidKeysError(Exception): @@ -23,7 +21,6 @@ class GitlabApiKeyHandler(): """Handles Gitlab API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the gitlab api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -31,9 +28,8 @@ class GitlabApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "gitlab_oauth_keys_list" @@ -72,15 +68,11 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models 
import WorkerOauth + keys = get_worker_oauth_keys('gitlab') - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + filtered_keys = [item for item in keys if item != self.config_key] - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + return filtered_keys def get_api_keys(self) -> List[str]: diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py index b2afded3a..3269d1ec3 100644 --- a/augur/tasks/gitlab/gitlab_random_key_auth.py +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -1,7 +1,4 @@ """Defines the GitlabRandomKeyAuth class""" - -from sqlalchemy.orm import Session - from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler @@ -11,12 +8,12 @@ class GitlabRandomKeyAuth(RandomKeyAuth): gitlab collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the gitlab api keys from the database via the GitlabApiKeyHandler - gitlab_api_keys = GitlabApiKeyHandler(session, logger).keys + gitlab_api_keys = GitlabApiKeyHandler(logger).keys if not gitlab_api_keys: print("Failed to find github api keys. This is usually because your key has expired") @@ -24,4 +21,4 @@ def __init__(self, session: Session, logger): header_name = "Authorization" key_format = "Bearer {0}" - super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file + super().__init__(gitlab_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py index 0892087d2..3f65f89f4 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -7,35 +7,6 @@ from augur.application.db.session import DatabaseSession from augur.application.db import get_engine -class GitlabTaskManifest: - """ - Manifest object that represents the state and common elements of - the specified task. GitLab version for the GitLab tasks. - - Attributes: - augur_db: sqlalchemy db object - key_auth: GitLab specific key auth retrieval collection - logger: logging object - platform_id: GitLab specific platform id (github is 1) - """ - - def __init__(self, logger): - - engine = get_engine() - - self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) - self.logger = logger - self.platform_id = 2 - - def __enter__(self): - - return self - - def __exit__(self, exception_type, exception_value, exception_traceback): - - self.augur_db.close() - class GitlabTaskSession(DatabaseSession): """ORM session used in gitlab tasks. 
This class adds the platform_id and the gitlab key authentication class, @@ -51,6 +22,6 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GitlabRandomKeyAuth(self, logger) + self.oauths = GitlabRandomKeyAuth(logger) self.platform_id = 2 diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index b96650c9a..8a987a774 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -7,12 +7,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor -from augur.application.db.util import execute_session_query +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 @@ -26,28 +26,24 @@ def collect_gitlab_issues(repo_git : str) -> int: """ logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - owner, repo = get_owner_repo(repo_git) - - issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + try: + owner, repo = get_owner_repo(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) - if issue_data: - issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger) - return issue_ids - else: - logger.info(f"{owner}/{repo} has no issues") - return [] - except Exception as e: + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 @@ -87,7 +83,7 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: return all_data -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: """ Retrieve only the needed data for issues from the api response @@ -96,7 +92,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging 
object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ # get repo_id or have it passed @@ -142,14 +138,14 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] issue_return_columns = ["gh_issue_id", "issue_id"] - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) issue_label_dicts = [] issue_assignee_dicts = [] @@ -176,12 +172,12 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids @@ -214,19 +210,18 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + comments = retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git) - comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + with get_session() as session: if comments: logger.info(f"Length of comments: {len(comments)}") - process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue comments") @@ -271,7 +266,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): return all_comments -def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for issue messages from the api response @@ -280,7 +275,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ 
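# The process_* helpers in these GitLab hunks all follow the same shape: query the existing
# Issue / PullRequest rows once, build a {source number -> augur id} map, then use it to
# attach foreign keys to the API payloads before handing them to bulk_insert_dicts().
# A toy, self-contained sketch of that mapping step; plain dicts stand in for the ORM rows
# and API notes, and the API field names here are illustrative, not the project's.
issues_in_db = [
    {"gh_issue_number": 1, "issue_id": 101},
    {"gh_issue_number": 2, "issue_id": 102},
]
issue_number_to_id_map = {row["gh_issue_number"]: row["issue_id"] for row in issues_in_db}

api_messages = [
    {"issue_number": 2, "id": 555, "body": "looks good"},
    {"issue_number": 7, "id": 556, "body": "orphan comment"},  # parent issue not collected yet
]

issue_message_ref_dicts = []
for message in api_messages:
    try:
        issue_id = issue_number_to_id_map[message["issue_number"]]
    except KeyError:
        continue  # skip comments whose parent issue is not in the database yet
    issue_message_ref_dicts.append(
        {"issue_id": issue_id, "issue_msg_ref_src_comment_id": message["id"]}
    )

print(issue_message_ref_dicts)  # [{'issue_id': 102, 'issue_msg_ref_src_comment_id': 555}]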
tool_source = "Gitlab issue comments" @@ -289,7 +284,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs issue_number_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id @@ -326,13 +321,13 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) issue_message_ref_dicts = [] @@ -349,7 +344,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index d5212a52d..5e56067c5 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -3,12 +3,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor -from augur.application.db.util import execute_session_query +from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts, 
get_repo_by_repo_git, get_session platform_id = 2 @@ -24,23 +24,21 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: logger = logging.getLogger(collect_gitlab_merge_requests.__name__) - with GitlabTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = manifest.augur_db + owner, repo = get_owner_repo(repo_git) - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id + key_auth = GitlabRandomKeyAuth(logger) - owner, repo = get_owner_repo(repo_git) - mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + mr_data = retrieve_all_mr_data(repo_git, logger, key_auth) - if mr_data: - mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger) - return mr_ids - else: - logger.info(f"{owner}/{repo} has no merge requests") - return [] + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: @@ -80,7 +78,7 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: return all_data -def process_merge_requests(data, task_name, repo_id, logger, augur_db): +def process_merge_requests(data, task_name, repo_id, logger): """ Retrieve only the needed data for mr label data from the api response @@ -89,7 +87,6 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object Returns: List of parsed MR ids. @@ -126,13 +123,13 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] pr_return_columns = ["pull_request_id", "pr_src_id"] - pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + pr_return_data = bulk_insert_dicts(logger, merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) mr_assignee_dicts = [] @@ -154,11 +151,11 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + bulk_insert_dicts(logger, mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) return mr_ids @@ -177,25 +174,24 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: owner, repo = 
get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_comments.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") - comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list") + + with get_session() as session: if comments: logger.info(f"Length of merge request comments: {len(comments)}") - process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request comments") -def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -204,7 +200,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab mr comments" @@ -213,7 +209,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -250,13 +246,13 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) mr_message_ref_dicts = [] @@ -273,7 +269,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + 
bulk_insert_dicts(logger, mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -289,24 +285,23 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_metadata.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") - metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + with get_session() as session: if metadata_list: logger.info(f"Length of merge request metadata: {len(metadata_list)}") - process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request metadata") -def process_mr_metadata(data, task_name, repo_id, logger, augur_db): +def process_mr_metadata(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -315,7 +310,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Metadata Task" @@ -324,7 +319,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -339,7 +334,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, all_metadata, PullRequestMeta, pr_metadata_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -355,24 +350,23 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_reviewers.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - url = 
"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") - reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict") + + with get_session() as session: if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") -def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): +def process_mr_reviewers(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr Reviewer data from the api response @@ -380,7 +374,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): data: List of dictionaries of mr Reviewer data repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Reviewer Task" @@ -391,7 +385,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -406,7 +400,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] - # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + # bulk_insert_dicts(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) @@ -423,25 +417,24 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_commits.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") - commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + with get_session() as session: if commits: logger.info(f"Length of merge request commits: {len(commits)}") - process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, 
session) else: logger.info(f"{owner}/{repo} has no gitlab merge request commits") -def process_mr_commits(data, task_name, repo_id, logger, augur_db): +def process_mr_commits(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr commits from the api response @@ -450,7 +443,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Commit Task" @@ -459,7 +452,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -475,7 +468,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -489,27 +482,26 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: repo_git: the repo url string """ + logger = logging.getLogger(collect_merge_request_files.__name__) + owner, repo = get_owner_repo(repo_git) - logger = logging.getLogger(collect_merge_request_files.__name__) - with GitlabTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = manifest.augur_db + key_auth = GitlabRandomKeyAuth(logger) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") - files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + with get_session() as session: if files: logger.info(f"Length of merge request files: {len(files)}") - process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request files") -def process_mr_files(data, task_name, repo_id, logger, augur_db): +def process_mr_files(data, task_name, repo_id, logger, session): tool_source = "Mr files Task" tool_version = "2.0" @@ -517,7 +509,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -530,7 +522,7 @@ def process_mr_files(data, task_name, repo_id, logger, 
augur_db): logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index a18284186..e57fb674d 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -13,8 +13,8 @@ from augur.application.logs import TaskLogConfig, AugurLogger from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine +from augur.application.db.lib import get_session from augur.application.config import AugurConfig from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import Repo @@ -83,7 +83,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h logger.error(f"Task {task_id} raised exception: {exc}\n Traceback: {''.join(traceback.format_exception(None, exc, exc.__traceback__))}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: logger.info(f"Repo git: {repo_git}") repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 866b7a028..6b35881d6 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -27,6 +27,7 @@ from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor +from augur.application.db.lib import execute_sql, get_session CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -153,7 +154,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 15): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -172,11 +173,11 @@ def core_task_success_util_gen(repo_git): primary_enabled_phases.append(core_task_success_util_gen) primary_gitlab_enabled_phases.append(core_task_success_util_gen) - primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) + primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=15, gitlab_phases=primary_gitlab_enabled_phases) primary_request.get_valid_repos(session) return primary_request -def build_secondary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -190,13 +191,13 @@ def secondary_task_success_util_gen(repo_git): return secondary_task_success_util.si(repo_git) secondary_enabled_phases.append(secondary_task_success_util_gen) - request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) + request 
= CollectionRequest("secondary",secondary_enabled_phases,max_repo=60, days_until_collect_again=10) request.get_valid_repos(session) return request -def build_facade_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_facade_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 10): #Deal with facade collection facade_enabled_phases = [] @@ -212,12 +213,12 @@ def facade_task_update_weight_util_gen(repo_git): facade_enabled_phases.append(facade_task_update_weight_util_gen) - request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) + request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=10) request.get_valid_repos(session) return request -def build_ml_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 40): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -227,7 +228,7 @@ def ml_task_success_util_gen(repo_git): ml_enabled_phases.append(ml_task_success_util_gen) - request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) + request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=40) request.get_valid_repos(session) return request @@ -240,29 +241,32 @@ def augur_collection_monitor(self): logger.info("Checking for repos to collect") - with DatabaseSession(logger, engine) as session: - #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config() + + #Get list of enabled phases + enabled_phase_names = get_enabled_phase_names_from_config() + + enabled_collection_hooks = [] - enabled_collection_hooks = [] + with DatabaseSession(logger, self.app.engine) as session: if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) #start_secondary_collection(session, max_repo=10) if facade_phase.__name__ in enabled_phase_names: #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) #start_ml_collection(session,max_repo=5) logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(session,enabled_collection_hooks) + + main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) main_routine.start_data_collection() @@ -278,7 +282,7 @@ def augur_collection_update_weights(self): logger.info("Updating stale collection weights") - with DatabaseSession(logger,engine) as session: + with get_session() as session: 
core_weight_update_repos = session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() @@ -301,7 +305,7 @@ def augur_collection_update_weights(self): repo = Repo.get_by_id(session, status.repo_id) commit_count = status.commit_sum - date_factor = get_facade_weight_time_factor(session, repo.repo_git) + date_factor = get_facade_weight_time_factor(repo.repo_git) weight = commit_count - date_factor update_query = ( @@ -324,27 +328,26 @@ def retry_errored_repos(self): #TODO: Isaac needs to normalize the status's to be abstract in the #collection_status table once augur dev is less unstable. - with DatabaseSession(logger,engine) as session: - query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" - f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" - f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" - f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" - - f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" - f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" - f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is not NULL;;""" - f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" - ) + query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" + + f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' 
and facade_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" + ) - session.execute_sql(query) + execute_sql(query) @@ -361,16 +364,17 @@ def create_collection_status_records(self): engine = self.app.engine logger = logging.getLogger(create_collection_status_records.__name__) - with DatabaseSession(logger,engine) as session: - query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) - """) + query = s.sql.text(""" + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) + """) + + repo = execute_sql(query).first() - repo = session.execute_sql(query).first() + with DatabaseSession(logger) as session: while repo is not None: - CollectionStatus.insert(session,repo[0]) - repo = session.execute_sql(query).first() - + CollectionStatus.insert(session, logger, repo[0]) + repo = execute_sql(query).first() + #Check for new repos every seven minutes to be out of step with the clone_repos task create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 977625862..f8156c8bf 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -12,13 +12,13 @@ from augur.application.db.util import execute_session_query from augur.application.db.lib import get_section from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue -from augur.application.db.session import DatabaseSession from augur.application.db import get_engine +from augur.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.tasks.util.collection_state import CollectionState -def get_list_of_all_users(session): +def get_list_of_all_users(): #Get a list of all users. 
query = s.sql.text(""" SELECT @@ -26,7 +26,7 @@ def get_list_of_all_users(session): FROM augur_operations.users """) - users = session.execute_sql(query).fetchall() + users = execute_sql(query).fetchall() return users @@ -129,13 +129,9 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab if name == "facade": self.new_status = CollectionState.UPDATE.value - def get_active_repo_count(self,session): - return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - - def get_valid_repos(self,session): - active_repo_count = self.get_active_repo_count(session) + active_repo_count = get_active_repo_count(self.name) limit = self.max_repo-active_repo_count if limit <= 0: @@ -242,7 +238,7 @@ def task_failed_util(self, request,exc,traceback): # log traceback to error file logger.error(f"Task {request.id} raised exception: {exc}\n{traceback}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: core_id_match = CollectionStatus.core_task_id == request.id secondary_id_match = CollectionStatus.secondary_task_id == request.id facade_id_match = CollectionStatus.facade_task_id == request.id @@ -301,7 +297,7 @@ def issue_pr_task_update_weight_util(self, issue_and_pr_nums,repo_git=None,sessi if session is not None: update_issue_pr_weights(logger, session, repo_git, sum(issue_and_pr_nums)) else: - with DatabaseSession(logger,engine=engine) as session: + with get_session() as session: update_issue_pr_weights(logger,session,repo_git,sum(issue_and_pr_nums)) @@ -314,7 +310,7 @@ def core_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through core collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -381,7 +377,7 @@ def secondary_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through secondary collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -407,7 +403,7 @@ def get_repo_weight_secondary(logger,repo_git): engine = get_engine() - with DatabaseSession(logger,engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") @@ -434,7 +430,7 @@ def facade_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -457,7 +453,7 @@ def ml_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through machine learning task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -482,7 +478,7 @@ def facade_clone_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade update/clone") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -553,21 +549,21 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. 
core session: Database session to use """ - def __init__(self,session,collection_hooks): - self.logger = session.logger + def __init__(self, logger,collection_hooks): + self.logger = logger self.collection_hooks = collection_hooks - self.session = session - def update_status_and_id(self,repo_git, task_id, name): - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + def update_status_and_id(self,repo_git, task_id, name, session): + # NOTE: Can't simply replace with lib method because it is doing .collection_status[0] afterwards + repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() #Set status in database to collecting repoStatus = repo.collection_status[0] # setattr(repoStatus,f"{name}_task_id",task_id) setattr(repoStatus,f"{name}_status", CollectionState.COLLECTING.value) - self.session.commit() + session.commit() def start_data_collection(self): @@ -581,8 +577,11 @@ def start_data_collection(self): #Send messages starts each repo and yields its running info #to concurrently update the correct field in the database. - for repo_git, task_id, hook_name in self.send_messages(): - self.update_status_and_id(repo_git,task_id,hook_name) + + with get_session() as session: + + for repo_git, task_id, hook_name in self.send_messages(): + self.update_status_and_id(repo_git,task_id,hook_name, session) def send_messages(self): augur_collection_list = [] @@ -593,7 +592,7 @@ def send_messages(self): for repo_git in col_hook.repo_list: - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + repo = get_repo_by_repo_git(repo_git) if "github" in repo.repo_git: augur_collection_sequence = [] for job in col_hook.phases: diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index 7f7bd6555..c9f865db9 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -33,7 +33,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: if self.list_of_keys: key_value = choice(self.list_of_keys) - self.logger.debug(f'Key value used: {key_value}') + self.logger.info(f'Key value used in request: {key_value}') # formats the key string into a format GitHub will accept if self.key_format: diff --git a/augur/templates/settings.j2 b/augur/templates/settings.j2 index c75b6522a..c10a0c914 100644 --- a/augur/templates/settings.j2 +++ b/augur/templates/settings.j2 @@ -56,7 +56,7 @@
 If you encounter any issues loading new repositories, please open an issue
- at https://github.com/chaoss/augur/issues
@@ -577,11 +577,27 @@ });
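The hunks above repeatedly swap the manifest-scoped `DatabaseSession(logger, engine)` usage for the shorter-lived helpers in `augur.application.db.lib` (`get_session`, `execute_sql`, `get_repo_by_repo_git`) and for `bulk_insert_dicts(logger, ...)`. As a rough sketch of the pattern this diff converges on (assuming `bulk_insert_dicts` is exported from the same `lib` module, and that `PullRequest`/`PullRequestCommit` keep the `augur.application.db.models` import path shown for other models in this diff), a collection task now looks roughly like:

import logging

# get_session and get_repo_by_repo_git appear in this diff's imports;
# the bulk_insert_dicts location is an assumption based on how it is called here.
from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts
from augur.application.db.models import PullRequest, PullRequestCommit  # assumed model paths

logger = logging.getLogger(__name__)

def store_mr_commits(all_commits, repo_git):
    # Resolve the repo id through the lib helper instead of a task manifest.
    repo_id = get_repo_by_repo_git(repo_git).repo_id

    # Open a session only for the ORM query itself.
    with get_session() as session:
        mr_number_to_id_map = {
            mr.pr_src_number: mr.pull_request_id
            for mr in session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all()
        }

    # Bulk insert takes the logger explicitly rather than a session wrapper,
    # with the same natural keys used in the hunks above.
    pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"]
    bulk_insert_dicts(logger, all_commits, PullRequestCommit, pr_commits_natural_keys)

    return mr_number_to_id_map

The effect of the change, as reflected across these files, is that database sessions are held only around the queries that need them, while lookups and bulk inserts go through module-level helpers parameterized by a logger.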