* Split the unit tests into regression and partial versions

* Set the trigger time of the regression test to 7:00 every Friday, Beijing time. * Replace the previous unit-test workflow with the partial test, which only runs the unit tests that correspond to the changed files. If a changed file has no corresponding unit test, all tests are run.
modelscope · Feb 28, 2025 · 77d7b50 · 77d7b50
1 parent e283e8d
commit 77d7b50
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 12 deletions.
diff --git a/.github/workflows/unit-test-partial.yml b/.github/workflows/unit-test-partial.yml
@@ -0,0 +1,77 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+# Workflow whose standalone test step runs tests/run.py with --mode partial.
+name: unittest-partial
+
+# Triggered manually, on every pull request, and on pushes to main.
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - main
+
+# The workflow token only gets read access to the repository contents.
+permissions:
+  contents: read
+
+# NOTE(review): presumably required so that actions built on an older Node
+# runtime still work on these runners -- confirm it is still needed.
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+jobs:
+  unittest-single:
+    # Runs on runners carrying both the GPU and unittest labels.
+    runs-on: [GPU, unittest]
+    environment: Testing
+    steps:
+    # Check out into a directory named after the run id, so each run works
+    # in its own directory.
+    - uses: actions/checkout@v3
+      with:
+        path: dj-${{ github.run_id }}
+
+    # Start the services defined in .github/workflows/docker in the background.
+    - name: Setup docker compose
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose up -d
+
+    # Editable install with the "all" extra in both the ray-head and the
+    # ray-worker containers.
+    - name: Install data-juicer
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head pip install -e .\[all\]
+        docker compose exec ray-worker pip install -e .\[all\]
+
+    # Remove the dataset cache directory inside ray-head before testing.
+    - name: Clean dataset cache
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head rm -rf /data/huggingface/dataset
+
+    # Standalone-tagged tests in partial mode; the OpenAI endpoint and key
+    # are passed into the container from repository secrets.
+    - name: Run unittest standalone
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone --mode partial
+
+    # include-hidden-files is enabled so hidden files in the report
+    # directory are uploaded as well.
+    - name: Upload coverage report of standalone
+      uses: actions/upload-artifact@v4
+      with:
+        name: coverage_report_standalone
+        include-hidden-files: true
+        path: dj-${{ github.run_id }}/coverage_report_standalone
+
+    # Ray-tagged tests run in partial mode too, matching the standalone step
+    # of this workflow; the full run belongs to the scheduled
+    # unittest-regression workflow.
+    - name: Run unittest ray
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head python tests/run.py --tag ray --mode partial
+
+    # include-hidden-files is enabled so hidden files in the report
+    # directory are uploaded as well.
+    - name: Upload coverage report of ray
+      uses: actions/upload-artifact@v4
+      with:
+        name: coverage_report_ray
+        include-hidden-files: true
+        path: dj-${{ github.run_id }}/coverage_report_ray
+
+    # always(): tear the containers down even when an earlier step failed.
+    - name: Remove docker compose
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      if: always()
+      run: |
+        docker compose down --remove-orphans
+
+    # always(): delete the per-run checkout directory even after a failure.
+    - name: Cleanup workspace
+      if: always()
+      run: |
+        rm -rf  dj-${{ github.run_id }}
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
@@ -1,14 +1,13 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# Runs only at 7:00 every Friday morning, Beijing time, which is 23:00 every Thursday in UTC.
 
-name: unittest
+name: unittest-regression
 
 on:
   workflow_dispatch:
-  pull_request:
-  push:
-    branches:
-      - main
+  schedule:
+    - cron: '0 23 * * 4'
 
 permissions:
   contents: read
@@ -44,7 +43,7 @@ jobs:
     - name: Run unittest standalone
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone
+        docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone --mode regression
 
     - name: Upload coverage report of standalone
       uses: actions/upload-artifact@v4
@@ -56,7 +55,7 @@ jobs:
     - name: Run unittest ray
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec ray-head python tests/run.py --tag ray
+        docker compose exec ray-head python tests/run.py --tag ray --mode regression
 
     - name: Upload coverage report of ray
       uses: actions/upload-artifact@v4

diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
@@ -1,5 +1,6 @@
 import os
 import shutil
+import subprocess
 import unittest
 
 import numpy
@@ -122,3 +123,39 @@ def convert_record(rec):
         first = sorted(first, key=lambda x: tuple(sorted(x.items())))
         second = sorted(second, key=lambda x: tuple(sorted(x.items())))
         return self.assertEqual(first, second)
+
+
+# Helpers for the partial unit test: map changed files to their test files.
+def get_diff_files(prefix_filter=('data_juicer/', 'tests/')):
+    """Get the changed Python files under the target dirs.
+
+    Runs ``git diff --name-only --diff-filter=ACMRT origin/main`` and filters
+    its output; the ``ACMRT`` filter leaves deleted files out.
+
+    :param prefix_filter: path prefix(es) a changed file must start with to
+        be kept. Either a single string or an iterable of strings.
+    :return: list of changed paths that end with ``.py``, excluding those
+        that end with ``__init__.py``.
+    :raises subprocess.CalledProcessError: if the git command exits with a
+        non-zero status.
+    """
+    # str.startswith takes a str or a tuple of str; normalizing here also
+    # lets the default be an immutable tuple instead of a shared list.
+    if isinstance(prefix_filter, str):
+        prefixes = (prefix_filter,)
+    else:
+        prefixes = tuple(prefix_filter)
+    changed_files = subprocess.check_output(
+        ['git', 'diff', '--name-only', '--diff-filter=ACMRT', 'origin/main'],
+        universal_newlines=True,
+    ).splitlines()
+    return [
+        f for f in changed_files
+        if f.startswith(prefixes) and f.endswith('.py')
+        and not f.endswith('__init__.py')
+    ]
+
+
+def find_corresponding_test_file(file_path):
+    """Find the unit-test file that corresponds to a changed file.
+
+    ``data_juicer`` in the path is replaced with ``tests`` and the file name
+    gets a ``test_`` prefix, e.g. ``data_juicer/a/b.py`` becomes
+    ``tests/a/test_b.py``. A file whose name already starts with ``test_``
+    keeps its name, so a changed test file maps to itself.
+
+    :param file_path: path of the changed file.
+    :return: the test file path if it exists on disk, otherwise ``None``.
+    """
+    test_file = file_path.replace('data_juicer', 'tests')
+    dir_name, basename = os.path.split(test_file)
+    # Without this guard a changed "test_x.py" would be looked up as
+    # "test_test_x.py", never be found, and force a run of all tests.
+    if not basename.startswith('test_'):
+        basename = 'test_' + basename
+    test_file = os.path.join(dir_name, basename)
+    if os.path.exists(test_file):
+        return test_file
+    return None
+
+
+def get_partial_test_cases():
+    """Collect the test files that correspond to the changed files.
+
+    :return: list of test file paths, one per changed file, or ``None`` when
+        any changed file has no corresponding test file (the caller then
+        runs all tests).
+    """
+    matched_tests = []
+    for changed_file in get_diff_files():
+        candidate = find_corresponding_test_file(changed_file)
+        if candidate is None:
+            # one unmatched changed file is enough to fall back to all tests
+            return None
+        matched_tests.append(candidate)
+    return matched_tests
diff --git a/tests/run.py b/tests/run.py
@@ -14,7 +14,7 @@
 
 from loguru import logger
 
-from data_juicer.utils.unittest_utils import set_clear_model_flag
+from data_juicer.utils.unittest_utils import set_clear_model_flag, get_partial_test_cases
 
 file_dir = os.path.join(os.path.dirname(__file__), '..')
 sys.path.append(file_dir)
@@ -24,6 +24,11 @@
                     default="standalone",
                     help="the tag of tests being run")
 parser.add_argument('--pattern', default='test_*.py', help='test file pattern')
+# Restrict --mode to the supported values so that a typo fails fast instead
+# of silently being treated as a non-partial (run everything) mode.
+parser.add_argument('--mode', default='partial',
+                    choices=['partial', 'regression'],
+                    help='test mode. Should be one of the ["partial", '
+                         '"regression"]. "partial" means only test on the '
+                         'unit tests of the changed files. "regression" means '
+                         'test on all unit tests.')
 parser.add_argument('--test_dir',
                     default='tests',
                     help='directory to be tested')
@@ -37,9 +42,12 @@
 set_clear_model_flag(args.clear_model)
 
 class TaggedTestLoader(unittest.TestLoader):
-    def __init__(self, tag="standalone"):
+    def __init__(self, tag="standalone", included_test_files=None):
+        """Create a loader for one tag, optionally limited to some files.
+
+        :param tag: stored on the loader as ``self.tag``.
+        :param included_test_files: a path string or a list of path
+            strings. A single string is wrapped into a one-element list;
+            any other value, including ``None``, is stored unchanged.
+        """
         super().__init__()
         self.tag = tag
+        # accept a bare string for convenience and normalize it to a list
+        if isinstance(included_test_files, str):
+            included_test_files = [included_test_files]
+        self.included_test_files = included_test_files
 
     def loadTestsFromTestCase(self, testCaseClass):
         # set tag to testcase class
@@ -53,9 +61,21 @@ def loadTestsFromTestCase(self, testCaseClass):
                 loaded_suite.addTest(test_case)
         return loaded_suite
 
-def gather_test_cases(test_dir, pattern, tag):
+    def _match_path(self, path, full_path, pattern):
+        """Match a discovered file, restricted to the included test files.
+
+        The parent loader's result is returned unchanged when no included
+        files are set, or when one of them is a substring of ``full_path``.
+        Otherwise the file is rejected with ``False``.
+        """
+        parent_result = super()._match_path(path, full_path, pattern)
+        if not self.included_test_files:
+            return parent_result
+        is_included = any(
+            wanted in full_path for wanted in self.included_test_files)
+        return parent_result if is_included else False
+
+def gather_test_cases(test_dir, pattern, tag, mode='partial'):
     test_to_run = unittest.TestSuite()
-    test_loader = TaggedTestLoader(tag)
+    partial_test_files = get_partial_test_cases() if mode == 'partial' else None
+    test_loader = TaggedTestLoader(tag, included_test_files=partial_test_files)
     discover = test_loader.discover(test_dir, pattern=pattern, top_level_dir=None)
     for suite_discovered in discover:
         print('suite_discovered', suite_discovered)
@@ -76,7 +96,7 @@ def main():
 
     runner = unittest.TextTestRunner()
     test_suite = gather_test_cases(os.path.abspath(args.test_dir),
-                                   args.pattern, args.tag)
+                                   args.pattern, args.tag, args.mode)
     res = runner.run(test_suite)
 
     cov.stop()