Commit: update humanvbench operators
SYSUzhouting committed Mar 9, 2025
1 parent c8b1837 commit 07209f7
Showing 175 changed files with 5,871 additions and 255 deletions.
41 changes: 25 additions & 16 deletions configs/config_all.yaml
@@ -514,40 +514,49 @@ process:
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.


# When using the HumanVBench mappers, keep_stats_in_res_ds should be set to true

- video_human_tracks_extraction_mapper: # Get the body and face trajectory bounding box of people in one shot of the video. To ensure correctness, it should be applied after video_split_by_scene_mapper
face_track_bbox_path: your_path_to_save_bounding_box_data
YOLOv8_human_model_path: ./data_juicer/my_pretrained_method/YOLOv8_human/weights/best.pt
face_track_bbox_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt # The storage location of the bounding box tracks of the characters in the video
YOLOv8_human_model_path: ./thirdparty/humanvbench_models/YOLOv8_human/weights/best.pt
mem_required: '10GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: ./HumanVBenchRecipe/dj_ASD_tempt # Used to store temporary videos
face_track_bbox_path: ./HumanVBenchRecipe/dj_human_track # Human track Data storage address in video_human_tracks_extraction_mapper
mem_required: '10GB'
- video_humantrack_face_demographic_mapper: # Get the facial demographics of each person based on the results of video_human_tracks_extraction_mapper
original_data_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # The location where the specific results of each frame's detection are stored
detect_interval: 5

- video_audio_attribute_mapper: # If the audio is speech, classify the gender and age of the speech
hf_audio_mapper: 'pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
hf_audio_mapper: '/mnt/daoyuan_open_research/zt_data/pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
mem_required: '7GB'

- video_captioning_from_human_tracks_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning
video_describe_model_path: pt_model/sharegpt4video-8b # model path to sharegpt4video-8b
tempt_video_path: data_juicer/HumanVBenchRecipe/dj_tmpt # Used to store temporary videos
mem_required: '35GB'
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # model path to sharegpt4video-8b
trust_remote_code: true
tempt_video_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos that will be removed finally.
mem_required: '40GB'

- video_captioning_face_attribute_emotion_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on judging the gender, age, and race of a single person in the video
face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words.
cropping_face_video_tempt_path: ./tempt_video/tmp_video_remove # Used to store temporary videos
video_describe_model_path: 'pt_model/VideoLLaMA2' # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
mem_required: '35GB'
trust_remote_code: true
cropping_face_video_tempt_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
video_describe_model_path: /mnt/daoyuan_open_research/zt_data/pt_model/videollm/VideoLLaMA3-7B # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
mem_required: '40GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: /home/daoyuan_mm/data-juicer/tmptreciept/tpt2 # Used to store temporary videos
Light_ASD_model_path: /home/daoyuan_mm/data-juicer/thirdparty/humanvbench_models/Light-ASD/weight/finetuning_TalkSet.model
acitve_threshold: 15
mem_required: '10GB'


- video_audio_speech_ASR_mapper: # Automatic speech recognition from video speech
model_dir_ASR: 'pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
model_dir_ASR: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'

- video_audio_speech_emotion_mapper: # Speech emotion recognition from video speech
model_dir_emo: 'pt_model/SenseVoiceSmall' # # Huggingface model FunAudioLLM/SenseVoiceSmall
model_dir_emo: '/mnt/daoyuan_open_research/zt_data/pt_model/SenseVoiceSmall' # # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'


# Filter ops
- video_face_ratio_filter: # Filter to retain human-centric videos
threshold: 0.65 # The lower limit of the ratio of frames with faces to the total number of video frames
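For context, the HumanVBench operators configured above run through data-juicer's config-driven pipeline. The following is a minimal sketch of invoking it programmatically; it assumes the init_configs/Executor entry points used by tools/process_data.py in this repository, and flag handling may differ across data-juicer versions.

# Minimal sketch: run the HumanVBench recipe via data-juicer's executor.
# Assumes the init_configs/Executor helpers from tools/process_data.py;
# adjust names and paths to your installed version.
from data_juicer.config import init_configs
from data_juicer.core import Executor

def main():
    # equivalent to: python tools/process_data.py --config configs/config_all.yaml
    cfg = init_configs(args=[
        '--config', 'configs/config_all.yaml',
        # the HumanVBench mappers read stats/meta written by earlier ops,
        # so keep_stats_in_res_ds must stay enabled (see the comment above)
        '--keep_stats_in_res_ds', 'true',
    ])
    executor = Executor(cfg)
    executor.run()

if __name__ == '__main__':
    main()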
11 changes: 4 additions & 7 deletions data_juicer/ops/filter/video_face_ratio_filter.py
@@ -1,7 +1,6 @@
import av
import numpy as np
from jsonargparse.typing import ClosedUnitInterval
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
pil_to_opencv, pil_to_opencv, process_each_frame)
@@ -13,9 +12,8 @@

OP_NAME = 'video_face_ratio_filter'

with AvailabilityChecking(['dlib', 'Pillow'], OP_NAME):
import cv2,dlib
from PIL import ImageFilter
import cv2,dlib
from PIL import ImageFilter

@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
@@ -49,13 +47,12 @@ def __init__(self,

# Initialize face detector
self.detector = dlib.get_frontal_face_detector()
# self.detector_key = prepare_model(model_type='face_detect_S3FD')


self.detect_interval = detect_interval


def compute_stats(self, sample, rank=None, context=False):
def compute_stats_single(self, sample, rank=None, context=False):
# check if it's computed already
if StatsKeys.video_face_exist in sample[Fields.stats]:
return sample
@@ -126,7 +123,7 @@ def compute_stats(self, sample, rank=None, context=False):

return sample

def process(self, sample):
def process_single(self, sample):
video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
keep_bools = np.array([
duration >= self.threshold
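The renames above (compute_stats to compute_stats_single, process to process_single) follow data-juicer's per-sample Filter interface. A minimal sketch of a filter written against that interface is shown below; the base_op import path, the register name, and the placeholder detection step are assumptions for illustration, while Fields.stats and StatsKeys.video_face_exist mirror the diff.

# Sketch of a per-sample filter using the compute_stats_single /
# process_single pattern visible in the diff above.
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.ops.base_op import OPERATORS, Filter


@OPERATORS.register_module('video_face_ratio_filter_sketch')
class VideoFaceRatioFilterSketch(Filter):

    def __init__(self, threshold: float = 0.65, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold

    def compute_stats_single(self, sample, rank=None, context=False):
        # skip if the stat was already computed for this sample
        if StatsKeys.video_face_exist in sample[Fields.stats]:
            return sample
        # ... run face detection here and store one ratio per video ...
        sample[Fields.stats][StatsKeys.video_face_exist] = [1.0]  # placeholder
        return sample

    def process_single(self, sample):
        # keep the sample only if every video meets the face-ratio threshold
        ratios = sample[Fields.stats][StatsKeys.video_face_exist]
        return all(r >= self.threshold for r in ratios)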
5 changes: 4 additions & 1 deletion data_juicer/ops/mapper/__init__.py
@@ -76,10 +76,12 @@
from .video_active_speaker_mapper import VideoActiveSpeakerMapper
from .video_audio_attribute_mapper import VideoAudioAttributeMapper
from .video_audio_speech_ASR_mapper import VideoAudioSpeechASRMapper
from .video_audio_speech_emotion_mapper import VideoAudioSpeechEmotionMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_captioning_from_human_tracks_mapper import VideoCaptioningFromHumanTracksMapper
from .video_human_tracks_extraction_mapper import VideoHumanTracksExtractionMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_humantrack_face_demographic_mapper import VideoHumantrackFaceDemographicMapper

__all__ = [
'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
@@ -115,5 +117,6 @@
'WhitespaceNormalizationMapper','VideoActiveSpeakerMapper',
'VideoAudioAttributeMapper', 'VideoAudioSpeechASRMapper',
'VideoCaptioningFaceAttributeEmotionMapper','VideoCaptioningFromHumanTracksMapper',
'VideoHumanTracksExtractionMapper', 'VideoCaptioningFaceAttributeEmotionMapper'
'VideoHumanTracksExtractionMapper', 'VideoCaptioningFaceAttributeEmotionMapper',
'VideoHumantrackFaceDemographicMapper', 'VideoAudioSpeechEmotionMapper'
]
67 changes: 34 additions & 33 deletions data_juicer/ops/mapper/video_active_speaker_mapper.py
@@ -1,6 +1,3 @@
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields

from data_juicer.utils.ASD_mapper_utils import get_video_array_cv2,evaluate_network, \
crop_video_with_facetrack, longest_continuous_actives

@@ -11,50 +8,54 @@

OP_NAME = 'video_active_speaker_mapper'

with AvailabilityChecking([], OP_NAME):
import torch
import sys
sys.path.append('./data_juicer/my_pretrained_method/Light-ASD')
import tempfile
import shutil, pickle
from shutil import rmtree
import os, subprocess
import tqdm, glob
# from model.faceDetector.s3fd import S3FD
import torch
import sys
sys.path.append('./thirdparty/humanvbench_models/Light-ASD')
from data_juicer.utils.constant import Fields, MetaKeys
import tempfile
import shutil, pickle
from shutil import rmtree
import os, subprocess
import tqdm, glob
# from model.faceDetector.s3fd import S3FD


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoActiveSpeakerMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

"""
"""

_default_kwargs = {'upsample_num_times': 0}

def __init__(self,
tempt_save_path: str = './HumanVBenchRecipe/dj_ASD_tempt',
face_track_bbox_path: str = './HumanVBenchRecipe/dj_human_track',
Light_ASD_model_path: str = 'weight/finetuning_TalkSet.model',
Light_ASD_model_path: str = './thirdparty/humanvbench_models/Light-ASD/weight/finetuning_TalkSet.model',
acitve_threshold: int = 15,
active_speaker_flag: str = MetaKeys.active_speaker_flag,
*args,
**kwargs):
"""
Initialization method.
:param blur_type:
"""
kwargs.setdefault('mem_required', '10GB')
super().__init__(*args, **kwargs)
self._accelerator = 'cuda'
self._init_parameters = self.remove_extra_parameters(locals())
self.acitve_threshold = acitve_threshold

self.tempt_save_path = tempt_save_path
self.face_track_bbox_path = face_track_bbox_path

# Initialize ASD model
self.ASD_model_key = prepare_model(model_type='Light_ASD',
pretrained_model_name_or_path=Light_ASD_model_path)


self.active_speaker_flag = active_speaker_flag

def active_speaker_detection_revise(self, active_score,is_child_descrip,speech_audio,face_gender):
speech_child = speech_audio['child'][0]
speech_male = speech_audio['male'][0]
@@ -85,7 +86,7 @@ def active_speaker_detection_revise(self, active_score,is_child_descrip,speech_a
if not is_child_voice == 'Not Sure':
if is_child_apperance == is_child_voice:
# gender consistency test
if speech_gender_confidence > 0.65 and float(face_gender[1]) > 0.65:
if speech_gender_confidence > 0.9 and float(face_gender[1]) > 0.9:
if not speech_gender == face_gender[0]:
speak_active = False
else:
@@ -95,31 +96,31 @@
return False


def process(self, sample, rank=None):
def process_single(self, sample, rank=None):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
sample[Fields.source_file] = []
return sample

if not Fields.video_audio_tags in sample:
if not MetaKeys.video_audio_tags in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_tagging_from_audio_mapper.")

if not Fields.human_track_data_path in sample:
if not MetaKeys.human_track_data_path in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_human_tracks_extraction_mapper.")

if not Fields.audio_speech_attribute in sample:
raise ValueError("video_active_speaker_mapper must be operated after audio_speech_attribute.")
if not MetaKeys.audio_speech_attribute in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_audio_attribute_mapper.")

if not Fields.video_facetrack_attribute_demographic in sample:
raise ValueError("video_active_speaker_mapper must be operated after video_facetrack_attribute_demographic.")
if not MetaKeys.video_facetrack_attribute_demographic in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_humantrack_face_demographic_mapper.")

if not Fields.video_facetrack_is_child in sample:
if not MetaKeys.video_track_is_child in sample[Fields.meta]:
raise ValueError("video_active_speaker_mapper must be operated after video_captioning_from_human_tracks_mapper.")

loaded_video_keys = sample[self.video_key]
audio_speech_attribute = sample[Fields.audio_speech_attribute]
face_demographic = sample[Fields.video_facetrack_attribute_demographic][0]
child_flag = sample[Fields.video_facetrack_is_child]
audio_speech_attribute = sample[Fields.meta][MetaKeys.audio_speech_attribute]
face_demographic = sample[Fields.meta][MetaKeys.video_facetrack_attribute_demographic][0]
child_flag = sample[Fields.meta][MetaKeys.video_track_is_child][0]

Total_result = []

@@ -131,7 +132,7 @@ def process(self, sample, rank=None):
if os.path.exists(temp_dir):
rmtree(temp_dir)

audio_tag = sample[Fields.video_audio_tags]
audio_tag = sample[Fields.meta][MetaKeys.video_audio_tags]
asd_detection_model = get_model(self.ASD_model_key, rank=rank)

for id_out,video_key in enumerate(loaded_video_keys):
@@ -157,7 +158,7 @@ def load_pkl(file_path):
with open(file_path, 'rb') as file:
return pickle.load(file)
# get allTracks
allTracks = [load_pkl(item['bbox_path']) for item in sample[Fields.human_track_data_path][id_out]]
allTracks = [load_pkl(item['bbox_path']) for item in sample[Fields.meta][MetaKeys.human_track_data_path][id_out]]

# Face clips cropping
for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)):
@@ -199,7 +200,7 @@ def load_pkl(file_path):
Total_result.append(speak_flag_for_tracks_in_a_video)
torch.cuda.empty_cache()

sample[Fields.ASD_revise_flag] = Total_result
sample[Fields.meta][self.active_speaker_flag] = Total_result

gc.collect()
torch.cuda.empty_cache()
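The tightened confidence thresholds above (0.65 to 0.9) gate a cross-modal consistency check: a face track keeps its active-speaker label only if the speech-based and face-based gender predictions do not contradict each other at high confidence. Below is a standalone sketch of that check; the argument layout and the meaning of the activity score are simplified assumptions, and only the thresholding logic mirrors the diff.

# Standalone sketch of the speaker/face consistency check implied by
# active_speaker_detection_revise. Inputs are simplified assumptions.
def consistent_active_speaker(active_score: float,
                              speech_gender: str,
                              speech_gender_confidence: float,
                              face_gender: str,
                              face_gender_confidence: float,
                              active_threshold: float = 15.0) -> bool:
    # 1) the ASD activity score over the track must clear the threshold
    if active_score < active_threshold:
        return False
    # 2) gender consistency test: veto only when BOTH modalities are
    #    confident (> 0.9, as in the updated threshold) and they disagree
    if speech_gender_confidence > 0.9 and face_gender_confidence > 0.9:
        if speech_gender != face_gender:
            return False
    return True


# usage example
print(consistent_active_speaker(22.0, 'male', 0.95, 'female', 0.97))  # False
print(consistent_active_speaker(22.0, 'male', 0.95, 'male', 0.97))    # True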
30 changes: 16 additions & 14 deletions data_juicer/ops/mapper/video_audio_attribute_mapper.py
@@ -1,8 +1,7 @@
import librosa
from data_juicer.utils.constant import Fields
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.mm_utils import extract_audio_from_video
from data_juicer.my_pretrained_method.audio_code.wav2vec_age_gender import process_func,AgeGenderModel
from thirdparty.humanvbench_models.audio_code.wav2vec_age_gender import process_func,AgeGenderModel
from ..base_op import OPERATORS, Mapper
from data_juicer.utils.model_utils import get_model, prepare_model

@@ -12,8 +11,7 @@
'tiktoken'
]

with AvailabilityChecking(CHECK_PKGS, NAME):
from data_juicer.utils.model_utils import get_model, prepare_model
from data_juicer.utils.model_utils import get_model, prepare_model



@@ -22,9 +20,12 @@ class VideoAudioAttributeMapper(Mapper):
"""Mapper to caption a video according to its audio streams based on
Qwen-Audio model.
"""
_accelerator = 'cuda'
_batched_op = True

def __init__(self,
hf_audio_mapper: str = None,
tag_field_name: str = MetaKeys.audio_speech_attribute,
*args, **kwargs):
"""
Initialization method.
@@ -36,31 +37,32 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
kwargs.setdefault('mem_required', '7GB')
super().__init__(*args, **kwargs)
self._accelerator = 'cuda'
self._model_sampling_rate = 16000

self._hf_summarizer = hf_audio_mapper if hf_audio_mapper else 'audeering/wav2vec2-large-robust-24-ft-age-gender' # noqa: E501
self.model_key = prepare_model(
model_type='huggingface',
model_type='wav2vec2_age_gender',
pretrained_model_name_or_path=self._hf_summarizer,
)
self.tag_field_name = tag_field_name




def process(self, sample, rank=None):
def process_single(self, sample, rank=None):
# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
return []

if not MetaKeys.video_audio_tags in sample[Fields.meta]:
raise ValueError("video_audio_attribute_mapper must be operated after video_tagging_from_audio_mapper.")

# get paths of all video(s)
loaded_video_keys = sample[self.video_key]
audio_tag = sample['__dj__video_audio_tags__']
audio_tag = sample[Fields.meta][MetaKeys.video_audio_tags]

Total_result = []
# get models
model, processor = get_model(self.model_key, rank=rank)
model, processor = get_model(self.model_key, rank, self.use_cuda())

for i,video in enumerate(loaded_video_keys):
audio_tag_this = audio_tag[i]
@@ -92,5 +94,5 @@ def process(self, sample, rank=None):
Age_female_male_child_dict['child'] = [Age_female_male_child[3]]
Total_result.append([Age_female_male_child_dict])

sample[Fields.audio_speech_attribute] = Total_result
sample[Fields.meta][self.tag_field_name] = Total_result
return sample
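After this change, the speech age/gender attributes are stored under sample[Fields.meta][MetaKeys.audio_speech_attribute] instead of a top-level field. A short sketch of reading those results back from a processed sample follows; the per-track dictionary keys other than 'child' (which appears in the diff) are assumptions inferred from the Age_female_male_child_dict variable name.

# Sketch: reading the speech attributes written by VideoAudioAttributeMapper.
# Keys other than 'child' are assumptions inferred from the diff above.
from data_juicer.utils.constant import Fields, MetaKeys

def summarize_speech_attributes(sample):
    results = sample[Fields.meta].get(MetaKeys.audio_speech_attribute, [])
    for video_idx, per_video in enumerate(results):
        for attr in per_video:          # one dict per detected speech segment
            child_score = attr.get('child', [None])[0]
            print(f'video {video_idx}: child score = {child_score}')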