From 7d65f631ea93f941b1d23dd0e1e4510c4b603fc2 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Thu, 23 Jan 2025 16:31:56 -0600 Subject: [PATCH 01/11] Update README.md version update Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 184a578a0..7467c59dd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.76.7 +# Augur NEW Release v0.80.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). @@ -12,7 +12,7 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o **If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. -Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.76.7). +Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.80.0). - The `main` branch is a stable version of our new architecture, which features: From f63d9c21af357f5df94ad8e6cdbb97e53df2f1de Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Thu, 23 Jan 2025 16:33:46 -0600 Subject: [PATCH 02/11] Update metadata.py Signed-off-by: Sean P. 
Goggins --- metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata.py b/metadata.py index 5b8dd83fe..6a1154f5f 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.76.7" -__release__ = "v0.76.7 (Captain Tuttle)" +__version__ = "0.80.0" +__release__ = "v0.80.0 (Data Monster)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2112" From 4d66bb1b7c879abcb9093556f8c7cbc7edd9e277 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 4 Feb 2025 18:55:16 -0600 Subject: [PATCH 03/11] prioritize small repos first to speed up initial collection of most repos --- augur/tasks/util/collection_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index a31fbbbf2..bed73bd12 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -154,11 +154,14 @@ def get_valid_repos(self,session): def get_newly_added_repos(session, limit, hook): condition_string = "" + order_by_field = "" if hook in ["core", "secondary", "ml"]: condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + order_by_field = "issue_pr_sum" elif hook == "facade": condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + order_by_field = "commit_sum" if hook == "secondary": condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" @@ -168,7 +171,7 @@ def get_newly_added_repos(session, limit, hook): from augur_operations.collection_status x, augur_data.repo y where x.repo_id=y.repo_id and {condition_string} - order by repo_added + order by {order_by_field} limit :limit_num """).bindparams(limit_num=limit) From e9635550b1063117f33fcaa2485240a40e4d45df Mon Sep 17 
00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 5 Feb 2025 09:02:14 -0600 Subject: [PATCH 04/11] Update README.md Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7467c59dd..14fdfc545 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.80.0 +# Augur NEW Release v0.80.1 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). @@ -12,7 +12,7 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o **If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. -Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.80.0). +Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.80.1). - The `main` branch is a stable version of our new architecture, which features: From 6ada8ce3e13a212974a1869307a6c4feb6d575d3 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 5 Feb 2025 09:02:35 -0600 Subject: [PATCH 05/11] Update metadata.py Signed-off-by: Sean P. 
Goggins --- metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata.py b/metadata.py index 6a1154f5f..a8e71cd7a 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.80.0" -__release__ = "v0.80.0 (Data Monster)" +__version__ = "0.80.1" +__release__ = "v0.80.1 (Data Monster)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2112" From 129b69c6fedfc42b385ad4851827765eb3aa3e8e Mon Sep 17 00:00:00 2001 From: Sean Goggins Date: Wed, 5 Feb 2025 15:10:00 +0000 Subject: [PATCH 06/11] update Signed-off-by: Sean Goggins --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 14fdfc545..55cfa78d0 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o ## NEW RELEASE ALERT! **If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. - Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.80.1). From 47884ca25082e5ff267c8e027196b2789fd66030 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 11 Feb 2025 10:53:08 -0600 Subject: [PATCH 07/11] Create gsoc-interest.md Signed-off-by: Sean P. 
Goggins --- gsoc-interest.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 gsoc-interest.md diff --git a/gsoc-interest.md b/gsoc-interest.md new file mode 100644 index 000000000..afa63217d --- /dev/null +++ b/gsoc-interest.md @@ -0,0 +1,31 @@ +# Google Summer of Code 2025 Interested Candidates + +Hi potential GSoC students, + +You can ask questions and meet the community on Slack here: https://join.slack.com/t/chaoss-workspace/shared_invite/zt-289zxh6tu-3oQaFlutPFY039MjKpnWcA ... look for the `wg-augur-8knot` channel. + +A few details regarding the application process specific to the CHAOSS project: + +1) You must complete one micro-task related to the idea you are interested in. You can find the micro-tasks on the GSoC Ideas Page at: [gsoc-ideas.md](./gsoc-ideas.md) + +2) Once you have completed one micro-task, create a pull request on this file below to add yourself, your information, and a link to your repository of the completed micro-task. **NOTE:** This repository requires [Developer Certificate of Origin](https://developercertificate.org/) (DCO) sign-off; see [CONTRIBUTING.md](https://github.com/chaoss/governance/blob/master/CONTRIBUTING.md#code-or-document-change-contributions-github-interface) for details on how to sign your commits. + +3) You are welcome to include in your repositories other information that could be of interest, such as open issues or pull requests submitted to the project to which you intend to contribute during GSoC, contributions to other projects, skills, and other related information. + +4) Using and submitting other people's work as your own is not allowed. If you use other people's work, be sure to acknowledge their work in your submission. + +5) Documentation of all code contributions is critical, and expected from all CHAOSS GSoC Students. + +You must complete these things by April 19, 2022 13:00 US Central Time (UTC-5). 
Make sure to also [submit the information required by GSoC for applicants](https://summerofcode.withgoogle.com/) (i.e., project proposal), linking to it from your pull request to this file. Here is an [Proposal Template](https://docs.google.com/document/d/1YZez6_hgp2dBybEsMZoQ-ONB9IawK4_OPISLHe9Tjew/edit) to get you started. + +Regards, +GSoC Mentors + +--- + +## Applicants + +**The applicants section will be completed as applicants are added here. At the moment, we are at the very beginning!** + + +**UPDATE:** This repository requires [Developer Certificate of Origin](https://developercertificate.org/) (DCO) sign-off; see [CONTRIBUTING.md](https://github.com/chaoss/governance/blob/master/CONTRIBUTING.md#code-or-document-change-contributions-github-interface) for details on how to sign your commits. From b2f67b073f7c8e448d2f5088a32253892469d1f8 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 11 Feb 2025 10:56:34 -0600 Subject: [PATCH 08/11] Create gsoc-ideas.md Signed-off-by: Sean P. Goggins --- gsoc-ideas.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 gsoc-ideas.md diff --git a/gsoc-ideas.md b/gsoc-ideas.md new file mode 100644 index 000000000..1cf74e11e --- /dev/null +++ b/gsoc-ideas.md @@ -0,0 +1,56 @@ + +## Idea: Enhance Conversational Topic Modelling Capabilities in CHAOSS Software + +**Hours: 350** + +[Micro-tasks and place for questions](https://github.com/chaoss/augur/issues/1640) + +This project will add GenSIM logic, and other capabilities to the Clustering Worker inside of Augur Software, and be extended into a generalized Open Source Software Conversational Topic Modeling Instrument. + +CHAOSS/augur has several workers that store machine learning information derived from computational linguistic analysis of data in the `message` table. The message table includes messages from issue, pull request, pull request review, and email messages. 
They are related to their origin with bridge tables like `pull_request_message_ref`. The ML/CL workers are all run against all the messages, regardless of origin. + +1. Clustering Worker (clusters created and topics modeled) +2. message analysis worker (sentiment and novelty analysis) +3. discourse analysis worker (speech act classification (question, answer, approval, etc.)) + +Clustering Worker Notes: + +Clustering Worker: 2 Models. + - Models: + - Topic modeling, but it needs a better way of estimating number of topics. + - Tables + - repo_topic + - topic_words + - Computational linguistic clustering + - Tables + - repo_cluster_messages + - Key Needs + - Add GenSim algorithms to topic modeling section https://github.com/chaoss/augur/issues/1199 + - The topics, and associated topic words need to be persisted after each run. At the moment, the topic words get overwritten for each topic modeling run. + - Description/optimization of the parameters used to create the computational linguistic clusters. + - Periodic deletion of models (heuristic: If 3 months pass, OR there’s a 10% increase in the messages, issues, or PRs in a repo, rebuild the models) + - Establish some kind of model archiving with appropriate metadata (lower priority) + +Discourse Analysis Worker Notes: + +discourse_insights table (select max(data_collection_date) for each msg_id) + - sequence is reassembled from the timestamp in the message table (look at msg_timestamp) + - issues_msg_ref, pull_request_message_ref, pull_request_review_msg_ref + +Message Analysis Worker + - message_analysis + - message_analysis_summary + +augur-tech + +The aims of the project are as follows: + - Advance topic modeling of open source software conversations captured in GitHub. + - Integrate this information into clearer, more parsimonious CHAOSS metrics. + - Automate the management of machine learning insights, and topic models over time. 
+ - (Stretch Goal) Improve the operation of the overall machine learning insights pipeline in CHAOSS/augur, and generalize these capabilities. + + +* _Difficulty:_ Medium +* _Requirements:_ Interest in software analytics. Python programming. Conceptual understanding of machine learning, and an eagerness to learn machine learning, and SQL knowledge. +* _Recommended:_ Experience with Python +* _Mentors:_ Sean Goggins, Andrew Brain, Isaac Milarsky From b1375f8ff4dd9f1184f05dd0d55856b07b605f30 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 11 Feb 2025 10:59:45 -0600 Subject: [PATCH 09/11] Update gsoc-interest.md Signed-off-by: Sean P. Goggins --- gsoc-interest.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gsoc-interest.md b/gsoc-interest.md index afa63217d..01c8adf75 100644 --- a/gsoc-interest.md +++ b/gsoc-interest.md @@ -29,3 +29,9 @@ GSoC Mentors **UPDATE:** This repository requires [Developer Certificate of Origin](https://developercertificate.org/) (DCO) sign-off; see [CONTRIBUTING.md](https://github.com/chaoss/governance/blob/master/CONTRIBUTING.md#code-or-document-change-contributions-github-interface) for details on how to sign your commits. + + +| Name | Email | Idea | Micro-Task Repository | Project Proposal | Submitted on GSOC | +| --- | --- | --- | --- | --- | --- | +| Your Name Here | Your Email Here | Idea You Are Hoping to Work On | Link to your Micro-task Repo | Link to Your Proposal | YES/NO | + From d2f1b81a4f14ef8c8ec869d47a8691d1065a33cb Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 11 Feb 2025 11:09:38 -0600 Subject: [PATCH 10/11] Update gsoc-interest.md Signed-off-by: Sean P. 
Goggins --- gsoc-interest.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gsoc-interest.md b/gsoc-interest.md index 01c8adf75..04a237581 100644 --- a/gsoc-interest.md +++ b/gsoc-interest.md @@ -16,7 +16,7 @@ A few details regarding the application process specific to the CHAOSS project: 5) Documentation of all code contributions is critical, and expected from all CHAOSS GSoC Students. -You must complete these things by April 19, 2022 13:00 US Central Time (UTC-5). Make sure to also [submit the information required by GSoC for applicants](https://summerofcode.withgoogle.com/) (i.e., project proposal), linking to it from your pull request to this file. Here is an [Proposal Template](https://docs.google.com/document/d/1YZez6_hgp2dBybEsMZoQ-ONB9IawK4_OPISLHe9Tjew/edit) to get you started. +You must complete these things by the GSOC Deadline. Make sure to also [submit the information required by GSoC for applicants](https://summerofcode.withgoogle.com/) (i.e., project proposal), linking to it from your pull request to this file. Here is an [Proposal Template](https://docs.google.com/document/d/1YZez6_hgp2dBybEsMZoQ-ONB9IawK4_OPISLHe9Tjew/edit) to get you started. 
Regards, GSoC Mentors From c199f859fd3ad0b7fbaa4a00a5797e1ca268b9c4 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 11 Feb 2025 19:54:16 -0600 Subject: [PATCH 11/11] comment out update weight task as it is no longer needed --- augur/tasks/init/celery_app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index edd833657..1be45b1f0 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -226,8 +226,8 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling refresh materialized view every night at 1am CDT") sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) - logger.info(f"Scheduling update of collection weights on midnight each day") - sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + # logger.info(f"Scheduling update of collection weights on midnight each day") + # sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) logger.info(f"Setting 404 repos to be marked for retry on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s())