score_threshold -> min/max_score

modelscope · Mar 7, 2025 · 4787508 · 4787508
1 parent 2fc1ad3
commit 4787508
Show file tree

Hide file tree

Showing 13 changed files with 38 additions and 38 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -605,7 +605,7 @@ process:
       max_ratio: 0.4                                          # the max face area ratio of filter range
   - image_nsfw_filter:                                      # filter samples according to the nsfw scores of images in them
       hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
-      score_threshold: 0.5                                    # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
+      max_score: 0.5                                          # the max nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
       mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_pair_similarity_filter:                           # filter samples according to the similarity score between the image pair.
@@ -651,7 +651,7 @@ process:
       min_score: 0.8                                          # the min language scores to filter text
   - llm_api_difficulty_score_filter:                        # filter to keep sample with high difficulty score estimated by LLM in API.
       api_model: 'gpt-4o'                                     # API model name.
-      score_threshold: 0.5                                    # The lowest difficulty score threshold to keep the sample.
+      min_score: 0.5                                          # The lowest difficulty score threshold to keep the sample.
       api_endpoint: null                                      # URL endpoint for the API.
       response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
       input_keys: ['text']                                    # Sub set of keys in the sample. Support data with multi fields such as 'query', 'analysis' and 'answer' in RFT data.
@@ -664,7 +664,7 @@ process:
       sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - llm_api_quality_score_filter:                           # filter to keep sample with high quality score estimated by LLM in API.
       api_model: 'gpt-4o'                                     # API model name.
-      score_threshold: 0.5                                    # The lowest quality score threshold to keep the sample.
+      min_score: 0.5                                          # The lowest quality score threshold to keep the sample.
       api_endpoint: null                                      # URL endpoint for the API.
       response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
       input_keys: ['text']                                    # Sub set of keys in the sample. Support data with multi fields such as 'query', 'analysis' and 'answer' in RFT data.
@@ -782,7 +782,7 @@ process:
       any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
   - video_nsfw_filter:                                      # filter samples according to the nsfw scores of videos in them
       hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
-      score_threshold: 0.5                                    # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
+      max_score: 0.5                                          # the max nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
       frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].

diff --git a/configs/data_juicer_recipes/data-juicer-sandbox-optimal.yaml b/configs/data_juicer_recipes/data-juicer-sandbox-optimal.yaml
@@ -10,7 +10,7 @@ np: 4  # number of subprocess to process your dataset
 process:
   - video_nsfw_filter:
       hf_nsfw_model: Falconsai/nsfw_image_detection
-      score_threshold: 0.000195383
+      max_score: 0.000195383
       frame_sampling_method: uniform
       frame_num: 3
       reduce_mode: avg

diff --git a/configs/data_juicer_recipes/data-juicer-sandbox-self-evolution.yaml b/configs/data_juicer_recipes/data-juicer-sandbox-self-evolution.yaml
@@ -10,7 +10,7 @@ np: 4  # number of subprocess to process your dataset
 process:
   - video_nsfw_filter:
       hf_nsfw_model: Falconsai/nsfw_image_detection
-      score_threshold: 0.000195383
+      max_score: 0.000195383
       frame_sampling_method: uniform
       frame_num: 3
       reduce_mode: avg

diff --git a/configs/data_juicer_recipes/general-video-refine-example.yaml b/configs/data_juicer_recipes/general-video-refine-example.yaml
@@ -51,7 +51,7 @@ process:
       any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
   - video_nsfw_filter:                                      # filter samples according to the nsfw scores of videos in them
       hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
-      score_threshold: 0.34847191                             # the nsfw score threshold for samples, range from 0 to 1
+      max_score: 0.34847191                                   # the max nsfw score threshold for samples, range from 0 to 1
       frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].

diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -23,15 +23,15 @@ class ImageNSFWFilter(Filter):
     def __init__(self,
                  hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
                  trust_remote_code: bool = False,
-                 score_threshold: float = 0.5,
+                 max_score: float = 0.5,
                  any_or_all: str = 'any',
                  *args,
                  **kwargs):
         """
         Initialization method.
 
         :param hf_nsfw_model: nsfw detection model name on huggingface.
-        :param score_threshold: the nsfw score threshold for samples.
+        :param max_score: the max nsfw score threshold for samples,
             range from 0 to 1. Samples with nsfw score less than this threshold
             will be kept.
         :param any_or_all: keep this sample with 'any' or 'all' strategy of
@@ -43,7 +43,7 @@ def __init__(self,
         """
         kwargs.setdefault('mem_required', '1GB')
         super().__init__(*args, **kwargs)
-        self.score_threshold = score_threshold
+        self.max_score = max_score
         if any_or_all not in ['any', 'all']:
             raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
                              f'Can only be one of ["any", "all"].')
@@ -90,7 +90,7 @@ def process_single(self, sample, rank=None):
             return True
 
         keep_bools = np.array(
-            [itm_score < self.score_threshold for itm_score in itm_scores])
+            [itm_score < self.max_score for itm_score in itm_scores])
 
         # different strategies
         if self.any:

diff --git a/data_juicer/ops/filter/llm_api_difficulty_score_filter.py b/data_juicer/ops/filter/llm_api_difficulty_score_filter.py
@@ -71,7 +71,7 @@ class LLMAPIDifficultyScoreFilter(Filter):
 
     def __init__(self,
                  api_model: str = 'gpt-4o',
-                 score_threshold: float = 0.5,
+                 min_score: float = 0.5,
                  *,
                  api_endpoint: Optional[str] = None,
                  response_path: Optional[str] = None,
@@ -88,7 +88,7 @@ def __init__(self,
         Initialization method.
 
         :param api_model: API model name.
-        :param score_threshold: The lowest difficulty score threshold to keep
+        :param min_score: The lowest difficulty score threshold to keep
             the sample.
         :param api_endpoint: URL endpoint for the API.
         :param response_path: Path to extract content from the API response.
@@ -117,7 +117,7 @@ def __init__(self,
         self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
         self.field_template = field_template or self.DEFAULT_FIELD_TEMPLATE
 
-        self.score_threshold = score_threshold
+        self.min_score = min_score
 
         self.sampling_params = sampling_params
 
@@ -202,4 +202,4 @@ def compute_stats_single(self, sample, rank=None, context=False):
     def process_single(self, sample, rank=None):
         itm_score = sample[Fields.stats][StatsKeys.llm_difficulty_score]
 
-        return itm_score >= self.score_threshold
+        return itm_score >= self.min_score
diff --git a/data_juicer/ops/filter/llm_api_quality_score_filter.py b/data_juicer/ops/filter/llm_api_quality_score_filter.py
@@ -74,7 +74,7 @@ class LLMAPIQualityScoreFilter(Filter):
 
     def __init__(self,
                  api_model: str = 'gpt-4o',
-                 score_threshold: float = 0.5,
+                 min_score: float = 0.5,
                  *,
                  api_endpoint: Optional[str] = None,
                  response_path: Optional[str] = None,
@@ -91,7 +91,7 @@ def __init__(self,
         Initialization method.
 
         :param api_model: API model name.
-        :param score_threshold: The lowest quality score threshold to keep the
+        :param min_score: The lowest quality score threshold to keep the
             sample.
         :param api_endpoint: URL endpoint for the API.
         :param response_path: Path to extract content from the API response.
@@ -120,7 +120,7 @@ def __init__(self,
         self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
         self.field_template = field_template or self.DEFAULT_FIELD_TEMPLATE
 
-        self.score_threshold = score_threshold
+        self.min_score = min_score
 
         self.sampling_params = sampling_params
 
@@ -202,4 +202,4 @@ def compute_stats_single(self, sample, rank=None, context=False):
     def process_single(self, sample, rank=None):
         itm_score = sample[Fields.stats][StatsKeys.llm_quality_score]
 
-        return itm_score >= self.score_threshold
+        return itm_score >= self.min_score
diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py
@@ -27,7 +27,7 @@ class VideoNSFWFilter(Filter):
     def __init__(self,
                  hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
                  trust_remote_code: bool = False,
-                 score_threshold: float = 0.5,
+                 max_score: float = 0.5,
                  frame_sampling_method: str = 'all_keyframes',
                  frame_num: PositiveInt = 3,
                  reduce_mode: str = 'avg',
@@ -38,7 +38,7 @@ def __init__(self,
         Initialization method.
 
         :param hf_nsfw_model: nsfw detection model name on huggingface.
-        :param score_threshold: the nsfw score threshold for samples.
+        :param max_score: the max nsfw score threshold for samples,
             range from 0 to 1. Samples with nsfw score less than this threshold
             will be kept.
         :param frame_sampling_method: sampling method of extracting frame
@@ -67,7 +67,7 @@ def __init__(self,
         """
         kwargs.setdefault('mem_required', '1GB')
         super().__init__(*args, **kwargs)
-        self.score_threshold = score_threshold
+        self.max_score = max_score
         if frame_sampling_method not in ['all_keyframes', 'uniform']:
             raise ValueError(
                 f'Frame sampling method '
@@ -168,7 +168,7 @@ def process_single(self, sample, rank=None):
             return True
 
         keep_bools = np.array(
-            [itm_score < self.score_threshold for itm_score in itm_scores])
+            [itm_score < self.max_score for itm_score in itm_scores])
 
         # different strategies
         if self.any:

diff --git a/demos/api_service/wrapped_filters.py b/demos/api_service/wrapped_filters.py
@@ -125,7 +125,7 @@ def execute_image_nsfw_filter(dataset_path: str) -> ServiceResponse:
         min_th, max_th = show_analyzed_results(export_path, require_min=False)
         dj_config = init_config(export_path,
                                 'image_nsfw_filter',
-                                score_threshold=max_th,
+                                max_score=max_th,
                                 hf_nsfw_model=nsfw_model_path)
         result_path = execute_config(dj_config)
         return ServiceResponse(ServiceExecStatus.SUCCESS,
@@ -150,7 +150,7 @@ def execute_video_nsfw_filter(dataset_path: str) -> ServiceResponse:
         min_th, max_th = show_analyzed_results(export_path, require_min=False)
         dj_config = init_config(export_path,
                                 'video_nsfw_filter',
-                                score_threshold=max_th,
+                                max_score=max_th,
                                 hf_nsfw_model=nsfw_model_path,
                                 frame_sampling_method='uniform')
         result_path = execute_config(dj_config)

diff --git a/tests/benchmark_performance/configs/video.yaml b/tests/benchmark_performance/configs/video.yaml
@@ -11,7 +11,7 @@ use_cache: false
 process:
   - video_nsfw_filter:
       hf_nsfw_model: 'Falconsai/nsfw_image_detection'
-      score_threshold: 1.0
+      max_score: 1.0
       mem_required: '1GB'
   - video_tagging_from_frames_mapper:
       mem_required: '9GB'

diff --git a/tests/ops/filter/test_image_nsfw_filter.py b/tests/ops/filter/test_image_nsfw_filter.py
@@ -59,7 +59,7 @@ def test_nsfw_filter(self):
 
         dataset = Dataset.from_list(ds_list)
         op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.0005)
+                            max_score=0.0005)
         self._run_filter(dataset, tgt_list, op)
 
     def test_any(self):
@@ -75,7 +75,7 @@ def test_any(self):
 
         dataset = Dataset.from_list(ds_list)
         op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.00012,
+                            max_score=0.00012,
                             any_or_all='any')
         self._run_filter(dataset, tgt_list, op)    
 
@@ -92,7 +92,7 @@ def test_all(self):
 
         dataset = Dataset.from_list(ds_list)
         op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.0005,
+                            max_score=0.0005,
                             any_or_all='all')
         self._run_filter(dataset, tgt_list, op)   
 
@@ -119,7 +119,7 @@ def test_multi_process(self):
 
         dataset = Dataset.from_list(ds_list)
         op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.0005)
+                            max_score=0.0005)
         self._run_filter(dataset, tgt_list, op, num_proc=num_proc)
 
 if __name__ == '__main__':

diff --git a/tests/ops/filter/test_video_nsfw_filter.py b/tests/ops/filter/test_video_nsfw_filter.py
@@ -59,7 +59,7 @@ def test_all_keyframes(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.1,
+                            max_score=0.1,
                             frame_sampling_method='all_keyframes')
         self._run_filter(dataset, tgt_list, op)
 
@@ -81,7 +81,7 @@ def test_uniform_frames(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.1,
+                            max_score=0.1,
                             frame_sampling_method='uniform',
                             frame_num=3)
         self._run_filter(dataset, tgt_list, op)
@@ -104,7 +104,7 @@ def test_reduce_max(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.9,
+                            max_score=0.9,
                             frame_sampling_method='all_keyframes',
                             reduce_mode='max')
         self._run_filter(dataset, tgt_list, op)
@@ -127,7 +127,7 @@ def test_reduce_min(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.0004,
+                            max_score=0.0004,
                             frame_sampling_method='all_keyframes',
                             reduce_mode='min')
         self._run_filter(dataset, tgt_list, op)
@@ -145,7 +145,7 @@ def test_any(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.01,
+                            max_score=0.01,
                             frame_sampling_method='all_keyframes',
                             any_or_all='any')
         self._run_filter(dataset, tgt_list, op)    
@@ -163,7 +163,7 @@ def test_all(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.1,
+                            max_score=0.1,
                             frame_sampling_method='all_keyframes',
                             any_or_all='all')
         self._run_filter(dataset, tgt_list, op)   
@@ -191,7 +191,7 @@ def test_multi_process(self):
 
         dataset = Dataset.from_list(ds_list)
         op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
-                            score_threshold=0.1,
+                            max_score=0.1,
                             frame_sampling_method='all_keyframes')
         self._run_filter(dataset, tgt_list, op, num_proc=num_proc)
 

diff --git a/tests/tools/test_process_data.py b/tests/tools/test_process_data.py
@@ -126,7 +126,7 @@ def test_ray_image(self):
                     'image_nsfw_filter': {
                         'hf_nsfw_model': 'Falconsai/nsfw_image_detection',
                         'trust_remote_code': True,
-                        'score_threshold': 0.5,
+                        'max_score': 0.5,
                         'any_or_all': 'any',
                         'mem_required': '8GB'
                     },