Commit: update humanvbench operators
SYSUzhouting committed Mar 9, 2025
1 parent c8b1837 commit 07209f7
Showing 175 changed files with 5,871 additions and 255 deletions.
41 changes: 25 additions & 16 deletions configs/config_all.yaml
@@ -514,40 +514,49 @@ process:
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.


# When using the HumanVBench mappers, keep_stats_in_res_ds should be set to true

- video_human_tracks_extraction_mapper: # Get the body and face trajectory bounding box of people in one shot of the video. To ensure correctness, it should be applied after video_split_by_scene_mapper
face_track_bbox_path: your_path_to_save_bounding_box_data
YOLOv8_human_model_path: ./data_juicer/my_pretrained_method/YOLOv8_human/weights/best.pt
face_track_bbox_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt # The storage location of the bounding box tracks of the characters in the video
YOLOv8_human_model_path: ./thirdparty/humanvbench_models/YOLOv8_human/weights/best.pt
mem_required: '10GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: ./HumanVBenchRecipe/dj_ASD_tempt # Used to store temporary videos
face_track_bbox_path: ./HumanVBenchRecipe/dj_human_track # Human track Data storage address in video_human_tracks_extraction_mapper
mem_required: '10GB'
- video_humantrack_face_demographic_mapper: # Get the facial demographics of each person based on the results of video_human_tracks_extraction_mapper
original_data_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # The location where the specific results of each frame's detection are stored
detect_interval: 5

- video_audio_attribute_mapper: # If the audio is speech, classify the gender and age of the speech
hf_audio_mapper: 'pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
hf_audio_mapper: '/mnt/daoyuan_open_research/zt_data/pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
mem_required: '7GB'

- video_captioning_from_human_tracks_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning
video_describe_model_path: pt_model/sharegpt4video-8b # model path to sharegpt4video-8b
tempt_video_path: data_juicer/HumanVBenchRecipe/dj_tmpt # Used to store temporary videos
mem_required: '35GB'
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # model path to sharegpt4video-8b
trust_remote_code: true
tempt_video_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos that will be removed finally.
mem_required: '40GB'

- video_captioning_face_attribute_emotion_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on judging the gender, age, and race of a single person in the video
face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words.
cropping_face_video_tempt_path: ./tempt_video/tmp_video_remove # Used to store temporary videos
video_describe_model_path: 'pt_model/VideoLLaMA2' # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
mem_required: '35GB'
trust_remote_code: true
cropping_face_video_tempt_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
mem_required: '40GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
Light_ASD_model_path: /home/daoyuan_mm/data-juicer/thirdparty/humanvbench_models/Light-ASD/weight/finetuning_TalkSet.model
acitve_threshold: 15
mem_required: '10GB'


- video_audio_speech_ASR_mapper: # Automatic speech recognition from video speech
model_dir_ASR: 'pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
model_dir_ASR: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'

- video_audio_speech_emotion_mapper: # Speech emotion recognition from video speech
model_dir_emo: 'pt_model/SenseVoiceSmall' # # Huggingface model FunAudioLLM/SenseVoiceSmall
model_dir_emo: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'


# Filter ops
- video_face_ratio_filter: # Filter to retain human-centric videos
threshold: 0.65 # The lower limit of the ratio of frames with faces to the total number of video frames
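For context, the HumanVBench operators configured above run through data-juicer's config-driven pipeline. The following is a minimal sketch of invoking it programmatically; it assumes the init_configs/Executor entry points used by tools/process_data.py in this repository, and flag handling may differ across data-juicer versions.

# Minimal sketch: run the HumanVBench recipe via data-juicer's executor.
# Assumes the init_configs/Executor helpers from tools/process_data.py;
# adjust names and paths to your installed version.
from data_juicer.config import init_configs
from data_juicer.core import Executor

def main():
    # equivalent to: python tools/process_data.py --config configs/config_all.yaml
    cfg = init_configs(args=[
        '--config', 'configs/config_all.yaml',
        # the HumanVBench mappers read stats/meta written by earlier ops,
        # so keep_stats_in_res_ds must stay enabled (see the comment above)
        '--keep_stats_in_res_ds', 'true',
    ])
    executor = Executor(cfg)
    executor.run()

if __name__ == '__main__':
    main()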
11 changes: 4 additions & 7 deletions data_juicer/ops/filter/video_face_ratio_filter.py
@@ -1,7 +1,6 @@
import av
import numpy as np
from jsonargparse.typing import ClosedUnitInterval
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
pil_to_opencv, pil_to_opencv, process_each_frame)
@@ -13,9 +12,8 @@

OP_NAME = 'video_face_ratio_filter'

with AvailabilityChecking(['dlib', 'Pillow'], OP_NAME):
import cv2,dlib
from PIL import ImageFilter
import cv2,dlib
from PIL import ImageFilter

@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
@@ -49,13 +47,12 @@ def __init__(self,

# Initialize face detector
self.detector = dlib.get_frontal_face_detector()
# self.detector_key = prepare_model(model_type='face_detect_S3FD')


self.detect_interval = detect_interval


def compute_stats(self, sample, rank=None, context=False):
def compute_stats_single(self, sample, rank=None, context=False):
# check if it's computed already
if StatsKeys.video_face_exist in sample[Fields.stats]:
return sample
@@ -126,7 +123,7 @@ def compute_stats(self, sample, rank=None, context=False):

return sample

def process(self, sample):
def process_single(self, sample):
video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
keep_bools = np.array([
duration >= self.threshold
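The renames above (compute_stats to compute_stats_single, process to process_single) follow data-juicer's per-sample Filter interface. A minimal sketch of a filter written against that interface is shown below; the base_op import path, the register name, and the placeholder detection step are assumptions for illustration, while Fields.stats and StatsKeys.video_face_exist mirror the diff.

# Sketch of a per-sample filter using the compute_stats_single /
# process_single pattern visible in the diff above.
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.ops.base_op import OPERATORS, Filter


@OPERATORS.register_module('video_face_ratio_filter_sketch')
class VideoFaceRatioFilterSketch(Filter):

    def __init__(self, threshold: float = 0.65, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold

    def compute_stats_single(self, sample, rank=None, context=False):
        # skip if the stat was already computed for this sample
        if StatsKeys.video_face_exist in sample[Fields.stats]:
            return sample
        # ... run face detection here and store one ratio per video ...
        sample[Fields.stats][StatsKeys.video_face_exist] = [1.0]  # placeholder
        return sample

    def process_single(self, sample):
        # keep the sample only if every video meets the face-ratio threshold
        ratios = sample[Fields.stats][StatsKeys.video_face_exist]
        return all(r >= self.threshold for r in ratios)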
5 changes: 4 additions & 1 deletion data_juicer/ops/mapper/__init__.py
@@ -76,10 +76,12 @@
from .video_active_speaker_mapper import VideoActiveSpeakerMapper
from .video_audio_attribute_mapper import VideoAudioAttributeMapper
from .video_audio_speech_ASR_mapper import VideoAudioSpeechASRMapper
from .video_audio_speech_emotion_mapper import VideoAudioSpeechEmotionMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_captioning_from_human_tracks_mapper import VideoCaptioningFromHumanTracksMapper
from .video_human_tracks_extraction_mapper import VideoHumanTracksExtractionMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_humantrack_face_demographic_mapper import VideoHumantrackFaceDemographicMapper

__all__ = [
'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
@@ -115,5 +117,6 @@
'WhitespaceNormalizationMapper','VideoActiveSpeakerMapper',
'VideoAudioAttributeMapper', 'VideoAudioSpeechASRMapper',
'VideoCaptioningFaceAttributeEmotionMapper','VideoCaptioningFromHumanTracksMapper',
'VideoHumanTracksExtractionMapper', 'VideoCaptioningFaceAttributeEmotionMapper'
'VideoHumanTracksExtractionMapper', 'VideoCaptioningFaceAttributeEmotionMapper',
'VideoHumantrackFaceDemographicMapper', 'VideoAudioSpeechEmotionMapper'
]
67 changes: 34 additions & 33 deletions data_juicer/ops/mapper/video_active_speaker_mapper.py
@@ -1,6 +1,3 @@
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields

from data_juicer.utils.ASD_mapper_utils import get_video_array_cv2,evaluate_network, \
crop_video_with_facetrack, longest_continuous_actives

@@ -11,50 +8,54 @@

OP_NAME = 'video_active_speaker_mapper'

with AvailabilityChecking([], OP_NAME):
import torch
import sys
sys.path.append('./data_juicer/my_pretrained_method/Light-ASD')
import tempfile
import shutil, pickle
from shutil import rmtree
import os, subprocess
import tqdm, glob
# from model.faceDetector.s3fd import S3FD
import torch
import sys
sys.path.append('./thirdparty/humanvbench_models/Light-ASD')
from data_juicer.utils.constant import Fields, MetaKeys
import tempfile
import shutil, pickle
from shutil import rmtree
import os, subprocess
import tqdm, glob
# from model.faceDetector.s3fd import S3FD


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoActiveSpeakerMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

"""
"""

_default_kwargs = {'upsample_num_times': 0}

def __init__(self,
tempt_save_path: str = './HumanVBenchRecipe/dj_ASD_tempt',
face_track_bbox_path: str = './HumanVBenchRecipe/dj_human_track',
Light_ASD_model_path: str = 'weight/finetuning_TalkSet.model',
Light_ASD_model_path: str = './thirdparty/humanvbench_models/Light-ASD/weight/finetuning_TalkSet.model',
acitve_threshold: int = 15,
active_speaker_flag: str = MetaKeys.active_speaker_flag,
*args,
**kwargs):
"""
Initialization method.
:param blur_type:
"""
kwargs.setdefault('mem_required', '10GB')
super().__init__(*args, **kwargs)
self._accelerator = 'cuda'
self._init_parameters = self.remove_extra_parameters(locals())
self.acitve_threshold = acitve_threshold

self.tempt_save_path = tempt_save_path
self.face_track_bbox_path = face_track_bbox_path

# Initialize ASD model
self.ASD_model_key = prepare_model(model_type='Light_ASD',
pretrained_model_name_or_path=Light_ASD_model_path)


self.active_speaker_flag = active_speaker_flag

def active_speaker_detection_revise(self, active_score,is_child_descrip,speech_audio,face_gender):
speech_child = speech_audio['child'][0]
speech_male = speech_audio['male'][0]
@@ -85,7 +86,7 @@ def active_speaker_detection_revise(self, active_score,is_child_descrip,speech_a
if not is_child_voice == 'Not Sure':
if is_child_apperance == is_child_voice:
# gender consistency test
if speech_gender_confidence > 0.65 and float(face_gender[1]) > 0.65:
if speech_gender_confidence > 0.9 and float(face_gender[1]) > 0.9:
if not speech_gender == face_gender[0]:
speak_active = False
else:
@@ -95,31 +96,31 @@
return False


def process(self, sample, rank=None):
def process_single(self, sample, rank=None):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if not Fields.video_audio_tags in sample:
if not MetaKeys.video_audio_tags in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_tagging_from_audio_mapper.")

if not Fields.human_track_data_path in sample:
if not MetaKeys.human_track_data_path in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_human_tracks_extraction_mapper.")

if not Fields.audio_speech_attribute in sample:
raise ValueError("video_active_speaker_mapper must be operated after audio_speech_attribute.")
if not MetaKeys.audio_speech_attribute in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_audio_attribute_mapper.")

if not Fields.video_facetrack_attribute_demographic in sample:
raise ValueError("video_active_speaker_mapper must be operated after video_facetrack_attribute_demographic.")
if not MetaKeys.video_facetrack_attribute_demographic in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_humantrack_face_demographic_mapper.")

if not Fields.video_facetrack_is_child in sample:
if not MetaKeys.video_track_is_child in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_captioning_from_human_tracks_mapper.")

loaded_video_keys = sample[self.video_key]
audio_speech_attribute = sample[Fields.audio_speech_attribute]
face_demographic = sample[Fields.video_facetrack_attribute_demographic][0]
child_flag = sample[Fields.video_facetrack_is_child]
audio_speech_attribute = sample[Fields.meta][MetaKeys.audio_speech_attribute]
face_demographic = sample[Fields.meta][MetaKeys.video_facetrack_attribute_demographic][0]
child_flag = sample[Fields.meta][MetaKeys.video_track_is_child][0]

Total_result = []

@@ -131,7 +132,7 @@ def process(self, sample, rank=None):
if os.path.exists(temp_dir):
rmtree(temp_dir)

audio_tag = sample[Fields.video_audio_tags]
audio_tag = sample[Fields.meta][MetaKeys.video_audio_tags]
asd_detection_model = get_model(self.ASD_model_key, rank=rank)

for id_out,video_key in enumerate(loaded_video_keys):
@@ -157,7 +158,7 @@ def load_pkl(file_path):
with open(file_path, 'rb') as file:
return pickle.load(file)
# get allTracks
allTracks = [load_pkl(item['bbox_path']) for item in sample[Fields.human_track_data_path][id_out]]
allTracks = [load_pkl(item['bbox_path']) for item in sample[Fields.meta][MetaKeys.human_track_data_path][id_out]]

# Face clips cropping
for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)):
@@ -199,7 +200,7 @@ def load_pkl(file_path):
Total_result.append(speak_flag_for_tracks_in_a_video)
torch.cuda.empty_cache()

sample[Fields.ASD_revise_flag] = Total_result
sample[Fields.meta][self.active_speaker_flag] = Total_result

gc.collect()
torch.cuda.empty_cache()
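The tightened confidence thresholds above (0.65 to 0.9) gate a cross-modal consistency check: a face track keeps its active-speaker label only if the speech-based and face-based gender predictions do not contradict each other at high confidence. Below is a standalone sketch of that check; the argument layout and the meaning of the activity score are simplified assumptions, and only the thresholding logic mirrors the diff.

# Standalone sketch of the speaker/face consistency check implied by
# active_speaker_detection_revise. Inputs are simplified assumptions.
def consistent_active_speaker(active_score: float,
                              speech_gender: str,
                              speech_gender_confidence: float,
                              face_gender: str,
                              face_gender_confidence: float,
                              active_threshold: float = 15.0) -> bool:
    # 1) the ASD activity score over the track must clear the threshold
    if active_score < active_threshold:
        return False
    # 2) gender consistency test: veto only when BOTH modalities are
    #    confident (> 0.9, as in the updated threshold) and they disagree
    if speech_gender_confidence > 0.9 and face_gender_confidence > 0.9:
        if speech_gender != face_gender:
            return False
    return True


# usage example
print(consistent_active_speaker(22.0, 'male', 0.95, 'female', 0.97))  # False
print(consistent_active_speaker(22.0, 'male', 0.95, 'male', 0.97))    # True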
30 changes: 16 additions & 14 deletions data_juicer/ops/mapper/video_audio_attribute_mapper.py
@@ -1,8 +1,7 @@
import librosa
from data_juicer.utils.constant import Fields
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.mm_utils import extract_audio_from_video
from data_juicer.my_pretrained_method.audio_code.wav2vec_age_gender import process_func,AgeGenderModel
from thirdparty.humanvbench_models.audio_code.wav2vec_age_gender import process_func,AgeGenderModel
from ..base_op import OPERATORS, Mapper
from data_juicer.utils.model_utils import get_model, prepare_model

@@ -12,8 +11,7 @@
'tiktoken'
]

with AvailabilityChecking(CHECK_PKGS, NAME):
from data_juicer.utils.model_utils import get_model, prepare_model
from data_juicer.utils.model_utils import get_model, prepare_model



@@ -22,9 +20,12 @@ class VideoAudioAttributeMapper(Mapper):
"""Mapper to caption a video according to its audio streams based on
Qwen-Audio model.
"""
_accelerator = 'cuda'
_batched_op = True

def __init__(self,
hf_audio_mapper: str = None,
tag_field_name: str = MetaKeys.audio_speech_attribute,
*args, **kwargs):
"""
Initialization method.
@@ -36,31 +37,32 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
kwargs.setdefault('mem_required', '7GB')
super().__init__(*args, **kwargs)
self._accelerator = 'cuda'
self._model_sampling_rate = 16000

self._hf_summarizer = hf_audio_mapper if hf_audio_mapper else 'audeering/wav2vec2-large-robust-24-ft-age-gender' # noqa: E501
self.model_key = prepare_model(
model_type='huggingface',
model_type='wav2vec2_age_gender',
pretrained_model_name_or_path=self._hf_summarizer,
)
self.tag_field_name = tag_field_name




def process(self, sample, rank=None):
def process_single(self, sample, rank=None):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
return []

if not MetaKeys.video_audio_tags in sample[Fields.meta]:
raise ValueError("video_audio_attribute_mapper must be operated after video_tagging_from_audio_mapper.")

# get paths of all video(s)
loaded_video_keys = sample[self.video_key]
audio_tag = sample['__dj__video_audio_tags__']
audio_tag = sample[Fields.meta][MetaKeys.video_audio_tags]

Total_result = []
# get models
model, processor = get_model(self.model_key, rank=rank)
model, processor = get_model(self.model_key, rank, self.use_cuda())

for i,video in enumerate(loaded_video_keys):
audio_tag_this = audio_tag[i]
@@ -92,5 +94,5 @@ def process(self, sample, rank=None):
Age_female_male_child_dict['child'] = [Age_female_male_child[3]]
Total_result.append([Age_female_male_child_dict])

sample[Fields.audio_speech_attribute] = Total_result
sample[Fields.meta][self.tag_field_name] = Total_result
return sample
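After this change, the speech age/gender attributes are stored under sample[Fields.meta][MetaKeys.audio_speech_attribute] instead of a top-level field. A short sketch of reading those results back from a processed sample follows; the per-track dictionary keys other than 'child' (which appears in the diff) are assumptions inferred from the Age_female_male_child_dict variable name.

# Sketch: reading the speech attributes written by VideoAudioAttributeMapper.
# Keys other than 'child' are assumptions inferred from the diff above.
from data_juicer.utils.constant import Fields, MetaKeys

def summarize_speech_attributes(sample):
    results = sample[Fields.meta].get(MetaKeys.audio_speech_attribute, [])
    for video_idx, per_video in enumerate(results):
        for attr in per_video:          # one dict per detected speech segment
            child_score = attr.get('child', [None])[0]
            print(f'video {video_idx}: child score = {child_score}')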