Skip to content

Commit

Permalink
score_threshold -> min/max_score
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Mar 7, 2025
1 parent 2fc1ad3 commit 4787508
Show file tree
Hide file tree
Showing 13 changed files with 38 additions and 38 deletions.
8 changes: 4 additions & 4 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ process:
max_ratio: 0.4 # the max face area ratio of filter range
- image_nsfw_filter: # filter samples according to the nsfw scores of images in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
max_score: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_pair_similarity_filter: # filter samples according to the similarity score between the image pair.
Expand Down Expand Up @@ -651,7 +651,7 @@ process:
min_score: 0.8 # the min language scores to filter text
- llm_api_difficulty_score_filter: # filter to keep sample with high difficulty score estimated by LLM in API.
api_model: 'gpt-4o' # API model name.
score_threshold: 0.5 # The lowest difficulty score threshold to keep the sample.
min_score: 0.5 # The lowest difficulty score threshold to keep the sample.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
input_keys: ['text'] # Sub set of keys in the sample. Support data with multi fields such as 'query', 'analysis' and 'answer' in RFT data.
Expand All @@ -664,7 +664,7 @@ process:
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- llm_api_quality_score_filter: # filter to keep sample with high quality score estimated by LLM in API.
api_model: 'gpt-4o' # API model name.
score_threshold: 0.5 # The lowest quality score threshold to keep the sample.
min_score: 0.5 # The lowest quality score threshold to keep the sample.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
input_keys: ['text'] # Sub set of keys in the sample. Support data with multi fields such as 'query', 'analysis' and 'answer' in RFT data.
Expand Down Expand Up @@ -782,7 +782,7 @@ process:
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
max_score: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts the specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ np: 4 # number of subprocess to process your dataset
process:
- video_nsfw_filter:
hf_nsfw_model: Falconsai/nsfw_image_detection
score_threshold: 0.000195383
max_score: 0.000195383
frame_sampling_method: uniform
frame_num: 3
reduce_mode: avg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ np: 4 # number of subprocess to process your dataset
process:
- video_nsfw_filter:
hf_nsfw_model: Falconsai/nsfw_image_detection
score_threshold: 0.000195383
max_score: 0.000195383
frame_sampling_method: uniform
frame_num: 3
reduce_mode: avg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ process:
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.34847191 # the nsfw score threshold for samples, range from 0 to 1
max_score: 0.34847191 # the nsfw score threshold for samples, range from 0 to 1
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts the specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ class ImageNSFWFilter(Filter):
def __init__(self,
hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
trust_remote_code: bool = False,
score_threshold: float = 0.5,
max_score: float = 0.5,
any_or_all: str = 'any',
*args,
**kwargs):
"""
Initialization method.
:param hf_nsfw_model: nsfw detection model name on huggingface.
:param score_threshold: the nsfw score threshold for samples.
:param max_score: the nsfw score threshold for samples, ranging
from 0 to 1. Samples with an nsfw score strictly less than this
threshold will be kept.
:param any_or_all: keep this sample with 'any' or 'all' strategy of
Expand All @@ -43,7 +43,7 @@ def __init__(self,
"""
kwargs.setdefault('mem_required', '1GB')
super().__init__(*args, **kwargs)
self.score_threshold = score_threshold
self.max_score = max_score
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
Expand Down Expand Up @@ -90,7 +90,7 @@ def process_single(self, sample, rank=None):
return True

keep_bools = np.array(
[itm_score < self.score_threshold for itm_score in itm_scores])
[itm_score < self.max_score for itm_score in itm_scores])

# different strategies
if self.any:
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/llm_api_difficulty_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class LLMAPIDifficultyScoreFilter(Filter):

def __init__(self,
api_model: str = 'gpt-4o',
score_threshold: float = 0.5,
min_score: float = 0.5,
*,
api_endpoint: Optional[str] = None,
response_path: Optional[str] = None,
Expand All @@ -88,7 +88,7 @@ def __init__(self,
Initialization method.
:param api_model: API model name.
:param score_threshold: The lowest difficulty score threshold to keep
:param min_score: The lowest difficulty score threshold to keep
the sample.
:param api_endpoint: URL endpoint for the API.
:param response_path: Path to extract content from the API response.
Expand Down Expand Up @@ -117,7 +117,7 @@ def __init__(self,
self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
self.field_template = field_template or self.DEFAULT_FIELD_TEMPLATE

self.score_threshold = score_threshold
self.min_score = min_score

self.sampling_params = sampling_params

Expand Down Expand Up @@ -202,4 +202,4 @@ def compute_stats_single(self, sample, rank=None, context=False):
def process_single(self, sample, rank=None):
itm_score = sample[Fields.stats][StatsKeys.llm_difficulty_score]

return itm_score >= self.score_threshold
return itm_score >= self.min_score
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/llm_api_quality_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class LLMAPIQualityScoreFilter(Filter):

def __init__(self,
api_model: str = 'gpt-4o',
score_threshold: float = 0.5,
min_score: float = 0.5,
*,
api_endpoint: Optional[str] = None,
response_path: Optional[str] = None,
Expand All @@ -91,7 +91,7 @@ def __init__(self,
Initialization method.
:param api_model: API model name.
:param score_threshold: The lowest quality score threshold to keep the
:param min_score: The lowest quality score threshold to keep the
sample.
:param api_endpoint: URL endpoint for the API.
:param response_path: Path to extract content from the API response.
Expand Down Expand Up @@ -120,7 +120,7 @@ def __init__(self,
self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
self.field_template = field_template or self.DEFAULT_FIELD_TEMPLATE

self.score_threshold = score_threshold
self.min_score = min_score

self.sampling_params = sampling_params

Expand Down Expand Up @@ -202,4 +202,4 @@ def compute_stats_single(self, sample, rank=None, context=False):
def process_single(self, sample, rank=None):
itm_score = sample[Fields.stats][StatsKeys.llm_quality_score]

return itm_score >= self.score_threshold
return itm_score >= self.min_score
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/video_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class VideoNSFWFilter(Filter):
def __init__(self,
hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
trust_remote_code: bool = False,
score_threshold: float = 0.5,
max_score: float = 0.5,
frame_sampling_method: str = 'all_keyframes',
frame_num: PositiveInt = 3,
reduce_mode: str = 'avg',
Expand All @@ -38,7 +38,7 @@ def __init__(self,
Initialization method.
:param hf_nsfw_model: nsfw detection model name on huggingface.
:param score_threshold: the nsfw score threshold for samples.
:param max_score: the nsfw score threshold for samples, ranging
from 0 to 1. Samples with an nsfw score strictly less than this
threshold will be kept.
:param frame_sampling_method: sampling method of extracting frame
Expand Down Expand Up @@ -67,7 +67,7 @@ def __init__(self,
"""
kwargs.setdefault('mem_required', '1GB')
super().__init__(*args, **kwargs)
self.score_threshold = score_threshold
self.max_score = max_score
if frame_sampling_method not in ['all_keyframes', 'uniform']:
raise ValueError(
f'Frame sampling method '
Expand Down Expand Up @@ -168,7 +168,7 @@ def process_single(self, sample, rank=None):
return True

keep_bools = np.array(
[itm_score < self.score_threshold for itm_score in itm_scores])
[itm_score < self.max_score for itm_score in itm_scores])

# different strategies
if self.any:
Expand Down
4 changes: 2 additions & 2 deletions demos/api_service/wrapped_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def execute_image_nsfw_filter(dataset_path: str) -> ServiceResponse:
min_th, max_th = show_analyzed_results(export_path, require_min=False)
dj_config = init_config(export_path,
'image_nsfw_filter',
score_threshold=max_th,
max_score=max_th,
hf_nsfw_model=nsfw_model_path)
result_path = execute_config(dj_config)
return ServiceResponse(ServiceExecStatus.SUCCESS,
Expand All @@ -150,7 +150,7 @@ def execute_video_nsfw_filter(dataset_path: str) -> ServiceResponse:
min_th, max_th = show_analyzed_results(export_path, require_min=False)
dj_config = init_config(export_path,
'video_nsfw_filter',
score_threshold=max_th,
max_score=max_th,
hf_nsfw_model=nsfw_model_path,
frame_sampling_method='uniform')
result_path = execute_config(dj_config)
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark_performance/configs/video.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use_cache: false
process:
- video_nsfw_filter:
hf_nsfw_model: 'Falconsai/nsfw_image_detection'
score_threshold: 1.0
max_score: 1.0
mem_required: '1GB'
- video_tagging_from_frames_mapper:
mem_required: '9GB'
Expand Down
8 changes: 4 additions & 4 deletions tests/ops/filter/test_image_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_nsfw_filter(self):

dataset = Dataset.from_list(ds_list)
op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.0005)
max_score=0.0005)
self._run_filter(dataset, tgt_list, op)

def test_any(self):
Expand All @@ -75,7 +75,7 @@ def test_any(self):

dataset = Dataset.from_list(ds_list)
op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.00012,
max_score=0.00012,
any_or_all='any')
self._run_filter(dataset, tgt_list, op)

Expand All @@ -92,7 +92,7 @@ def test_all(self):

dataset = Dataset.from_list(ds_list)
op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.0005,
max_score=0.0005,
any_or_all='all')
self._run_filter(dataset, tgt_list, op)

Expand All @@ -119,7 +119,7 @@ def test_multi_process(self):

dataset = Dataset.from_list(ds_list)
op = ImageNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.0005)
max_score=0.0005)
self._run_filter(dataset, tgt_list, op, num_proc=num_proc)

if __name__ == '__main__':
Expand Down
14 changes: 7 additions & 7 deletions tests/ops/filter/test_video_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_all_keyframes(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.1,
max_score=0.1,
frame_sampling_method='all_keyframes')
self._run_filter(dataset, tgt_list, op)

Expand All @@ -81,7 +81,7 @@ def test_uniform_frames(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.1,
max_score=0.1,
frame_sampling_method='uniform',
frame_num=3)
self._run_filter(dataset, tgt_list, op)
Expand All @@ -104,7 +104,7 @@ def test_reduce_max(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.9,
max_score=0.9,
frame_sampling_method='all_keyframes',
reduce_mode='max')
self._run_filter(dataset, tgt_list, op)
Expand All @@ -127,7 +127,7 @@ def test_reduce_min(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.0004,
max_score=0.0004,
frame_sampling_method='all_keyframes',
reduce_mode='min')
self._run_filter(dataset, tgt_list, op)
Expand All @@ -145,7 +145,7 @@ def test_any(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.01,
max_score=0.01,
frame_sampling_method='all_keyframes',
any_or_all='any')
self._run_filter(dataset, tgt_list, op)
Expand All @@ -163,7 +163,7 @@ def test_all(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.1,
max_score=0.1,
frame_sampling_method='all_keyframes',
any_or_all='all')
self._run_filter(dataset, tgt_list, op)
Expand Down Expand Up @@ -191,7 +191,7 @@ def test_multi_process(self):

dataset = Dataset.from_list(ds_list)
op = VideoNSFWFilter(hf_nsfw_model=self.hf_nsfw_model,
score_threshold=0.1,
max_score=0.1,
frame_sampling_method='all_keyframes')
self._run_filter(dataset, tgt_list, op, num_proc=num_proc)

Expand Down
2 changes: 1 addition & 1 deletion tests/tools/test_process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def test_ray_image(self):
'image_nsfw_filter': {
'hf_nsfw_model': 'Falconsai/nsfw_image_detection',
'trust_remote_code': True,
'score_threshold': 0.5,
'max_score': 0.5,
'any_or_all': 'any',
'mem_required': '8GB'
},
Expand Down

0 comments on commit 4787508

Please sign in to comment.