diff --git a/.github/workflows/unit-test-partial.yml b/.github/workflows/unit-test-partial.yml new file mode 100644 index 000000000..460c52fa3 --- /dev/null +++ b/.github/workflows/unit-test-partial.yml @@ -0,0 +1,77 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: unittest-partial + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + +jobs: + unittest-single: + runs-on: [GPU, unittest] + environment: Testing + steps: + - uses: actions/checkout@v3 + with: + path: dj-${{ github.run_id }} + + - name: Setup docker compose + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose up -d + + - name: Install data-juicer + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head pip install -e .\[all\] + docker compose exec ray-worker pip install -e .\[all\] + + - name: Clean dataset cache + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head rm -rf /data/huggingface/dataset + + - name: Run unittest standalone + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone --mode partial + + - name: Upload coverage report of standalone + uses: actions/upload-artifact@v4 + with: + name: coverage_report_standalone + include-hidden-files: true + path: dj-${{ github.run_id }}/coverage_report_standalone + + - name: Run unittest ray + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head python tests/run.py --tag ray 
--mode regression + + - name: Upload coverage report of ray + uses: actions/upload-artifact@v4 + with: + name: coverage_report_ray + include-hidden-files: true + path: dj-${{ github.run_id }}/coverage_report_ray + + - name: Remove docker compose + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + if: always() + run: | + docker compose down --remove-orphans + + - name: Cleanup workspace + if: always() + run: | + rm -rf dj-${{ github.run_id }} diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 021c384bf..07e662610 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -1,14 +1,13 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python +# Test only on 7:00 morning of Friday each week in Beijing time, which is 23:00 of Thursday in UTC time. -name: unittest +name: unittest-regression on: workflow_dispatch: - pull_request: - push: - branches: - - main + schedule: + - cron: '0 23 * * 4' permissions: contents: read @@ -44,7 +43,7 @@ jobs: - name: Run unittest standalone working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | - docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone + docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone --mode regression - name: Upload coverage report of standalone uses: actions/upload-artifact@v4 @@ -56,7 +55,7 @@ jobs: - name: Run unittest ray working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | - docker compose exec ray-head python tests/run.py --tag ray + docker compose exec ray-head python tests/run.py --tag ray --mode regression - 
# for partial unittest
def get_diff_files(prefix_filter=('data_juicer/', 'tests/')):
    """Return the changed Python files that live under the given prefixes.

    Runs ``git diff --name-only --diff-filter=ACMRT origin/main`` and keeps
    only the ``.py`` files whose path starts with one of the prefixes,
    excluding ``__init__.py`` files.

    :param prefix_filter: iterable of path prefixes to keep. The default is
        an immutable tuple so it cannot be mutated across calls; a list is
        still accepted.
    :return: list of changed file paths as printed by git.
    :raises subprocess.CalledProcessError: if the git command fails.
    """
    # str.startswith accepts a tuple, which avoids a Python-level any() loop.
    prefixes = tuple(prefix_filter)
    output = subprocess.check_output(
        ['git', 'diff', '--name-only', '--diff-filter=ACMRT', 'origin/main'],
        universal_newlines=True,
    )
    return [
        path for path in output.splitlines()
        if path.startswith(prefixes) and path.endswith('.py')
        and not path.endswith('__init__.py')
    ]


def find_corresponding_test_file(file_path):
    """Map a changed file to the unit test file that covers it.

    The ``data_juicer`` directory component is mirrored to ``tests`` and the
    file name gets a ``test_`` prefix, unless it already has one (i.e. the
    changed file is itself a test file).

    :param file_path: '/'-separated path of the changed file.
    :return: path of the test file if it exists on disk, otherwise None.
    """
    directory, basename = os.path.split(file_path)

    # Only rewrite a whole directory component; a blind str.replace would
    # also mangle file names that merely contain "data_juicer".
    components = directory.split('/')
    if 'data_juicer' in components:
        components[components.index('data_juicer')] = 'tests'
    directory = '/'.join(components)

    # A changed test file is its own test: do not turn test_x.py into
    # test_test_x.py, which never exists and would force a full run.
    if not basename.startswith('test_'):
        basename = 'test_' + basename

    candidate = os.path.join(directory, basename)
    return candidate if os.path.exists(candidate) else None


def get_partial_test_cases():
    """Collect the test files that correspond to the files changed in git.

    :return: list of test file paths without duplicates, in the order the
        changed files were reported, or None when at least one changed file
        has no corresponding test file (callers treat None as "run all").
    """
    test_files = []
    for changed_file in get_diff_files():
        test_file = find_corresponding_test_file(changed_file)
        if test_file is None:
            # can't find corresponding test files for some changed files:
            # run all
            return None
        if test_file not in test_files:
            test_files.append(test_file)
    return test_files
def _match_path(self, path, full_path, pattern):
    """Decide whether a discovered file should be loaded.

    The parent loader's pattern matching is evaluated first. When
    ``self.included_test_files`` is non-empty, the parent's result is only
    returned for files whose full path contains one of the included entries;
    every other file is rejected. When it is empty or None, the parent's
    result is returned unchanged.
    """
    parent_result = super()._match_path(path, full_path, pattern)
    allowed_files = self.included_test_files
    if not allowed_files:
        # no restriction configured: behave exactly like the parent loader
        return parent_result
    if any(allowed in full_path for allowed in allowed_files):
        return parent_result
    return False
main(): runner = unittest.TextTestRunner() test_suite = gather_test_cases(os.path.abspath(args.test_dir), - args.pattern, args.tag) + args.pattern, args.tag, args.mode) res = runner.run(test_suite) cov.stop()