From 0bcfb14c395a9d892e8285f3bdeebbb72354f710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ce=20Ge=20=28=E6=88=88=E7=AD=96=29?= Date: Fri, 25 Oct 2024 14:35:02 +0800 Subject: [PATCH] minor fix (#462) * minor fix * simplify * enhance req --- README.md | 2 +- data_juicer/ops/deduplicator/__init__.py | 4 - data_juicer/ops/filter/__init__.py | 82 ++++--------------- .../ops/filter/image_face_count_filter.py | 1 - data_juicer/ops/mapper/__init__.py | 30 ------- data_juicer/ops/selector/__init__.py | 2 - data_juicer/utils/auto_install_mapping.py | 3 +- docs/Operators.md | 2 +- docs/Operators_ZH.md | 2 +- environments/minimal_requires.txt | 1 + environments/science_requires.txt | 3 +- 11 files changed, 22 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 5ae6ae095..46198be90 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Table of Contents visualization, and multidimensional automatic evaluation, so that you can better understand and improve your data and models. ![Data-in-the-loop](https://img.alicdn.com/imgextra/i2/O1CN017U7Zz31Y7XtCJ5GOz_!!6000000003012-0-tps-3640-1567.jpg) -- **Towards production environment **: Providing efficient and parallel data processing pipelines (Aliyun-PAI\Ray\Slurm\CUDA\OP Fusion) +- **Towards production environment**: Providing efficient and parallel data processing pipelines (Aliyun-PAI\Ray\Slurm\CUDA\OP Fusion) requiring less memory and CPU usage, optimized with automatic fault-toleration. ![sys-perf](https://img.alicdn.com/imgextra/i4/O1CN01Sk0q2U1hdRxbnQXFg_!!6000000004300-0-tps-2438-709.jpg) diff --git a/data_juicer/ops/deduplicator/__init__.py b/data_juicer/ops/deduplicator/__init__.py index 69f73b361..4a19ab056 100644 --- a/data_juicer/ops/deduplicator/__init__.py +++ b/data_juicer/ops/deduplicator/__init__.py @@ -1,7 +1,3 @@ -from . 
import (document_deduplicator, document_minhash_deduplicator, - document_simhash_deduplicator, image_deduplicator, - ray_document_deduplicator, ray_image_deduplicator, - ray_video_deduplicator, video_deduplicator) from .document_deduplicator import DocumentDeduplicator from .document_minhash_deduplicator import DocumentMinhashDeduplicator from .document_simhash_deduplicator import DocumentSimhashDeduplicator diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index f21c81546..ace768932 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -1,25 +1,3 @@ -# yapf: disable -from . import (alphanumeric_filter, audio_duration_filter, - audio_nmf_snr_filter, audio_size_filter, - average_line_length_filter, character_repetition_filter, - flagged_words_filter, image_aesthetics_filter, - image_aspect_ratio_filter, image_face_count_filter, - image_face_ratio_filter, image_nsfw_filter, - image_pair_similarity_filter, image_shape_filter, - image_size_filter, image_text_matching_filter, - image_text_similarity_filter, image_watermark_filter, - language_id_score_filter, maximum_line_length_filter, - perplexity_filter, phrase_grounding_recall_filter, - special_characters_filter, specified_field_filter, - specified_numeric_field_filter, stopwords_filter, suffix_filter, - text_action_filter, text_entity_dependency_filter, - text_length_filter, token_num_filter, video_aesthetics_filter, - video_aspect_ratio_filter, video_duration_filter, - video_frames_text_similarity_filter, video_motion_score_filter, - video_nsfw_filter, video_ocr_area_ratio_filter, - video_resolution_filter, video_tagging_from_frames_filter, - video_watermark_filter, word_repetition_filter, - words_num_filter) from .alphanumeric_filter import AlphanumericFilter from .audio_duration_filter import AudioDurationFilter from .audio_nmf_snr_filter import AudioNMFSNRFilter @@ -66,49 +44,21 @@ from .words_num_filter import WordsNumFilter __all__ = [ - 
'ImageTextSimilarityFilter', - 'VideoAspectRatioFilter', - 'ImageTextMatchingFilter', - 'ImageNSFWFilter', - 'TokenNumFilter', - 'TextLengthFilter', - 'SpecifiedNumericFieldFilter', - 'AudioNMFSNRFilter', - 'VideoAestheticsFilter', - 'PerplexityFilter', - 'PhraseGroundingRecallFilter', - 'MaximumLineLengthFilter', - 'AverageLineLengthFilter', - 'SpecifiedFieldFilter', - 'VideoTaggingFromFramesFilter', - 'TextEntityDependencyFilter', - 'VideoResolutionFilter', - 'AlphanumericFilter', - 'ImageWatermarkFilter', - 'ImageAestheticsFilter', - 'AudioSizeFilter', - 'StopWordsFilter', - 'CharacterRepetitionFilter', - 'ImageShapeFilter', - 'VideoDurationFilter', - 'TextActionFilter', - 'VideoOcrAreaRatioFilter', - 'VideoNSFWFilter', - 'SpecialCharactersFilter', - 'VideoFramesTextSimilarityFilter', - 'ImageAspectRatioFilter', - 'AudioDurationFilter', - 'LanguageIDScoreFilter', - 'SuffixFilter', - 'ImageSizeFilter', - 'VideoWatermarkFilter', - 'WordsNumFilter', - 'ImageFaceCountFilter', - 'ImageFaceRatioFilter', - 'FlaggedWordFilter', - 'WordRepetitionFilter', - 'VideoMotionScoreFilter', + 'ImageTextSimilarityFilter', 'VideoAspectRatioFilter', + 'ImageTextMatchingFilter', 'ImageNSFWFilter', 'TokenNumFilter', + 'TextLengthFilter', 'SpecifiedNumericFieldFilter', 'AudioNMFSNRFilter', + 'VideoAestheticsFilter', 'PerplexityFilter', 'PhraseGroundingRecallFilter', + 'MaximumLineLengthFilter', 'AverageLineLengthFilter', + 'SpecifiedFieldFilter', 'VideoTaggingFromFramesFilter', + 'TextEntityDependencyFilter', 'VideoResolutionFilter', + 'AlphanumericFilter', 'ImageWatermarkFilter', 'ImageAestheticsFilter', + 'AudioSizeFilter', 'StopWordsFilter', 'CharacterRepetitionFilter', + 'ImageShapeFilter', 'VideoDurationFilter', 'TextActionFilter', + 'VideoOcrAreaRatioFilter', 'VideoNSFWFilter', 'SpecialCharactersFilter', + 'VideoFramesTextSimilarityFilter', 'ImageAspectRatioFilter', + 'AudioDurationFilter', 'LanguageIDScoreFilter', 'SuffixFilter', + 'ImageSizeFilter', 'VideoWatermarkFilter', 
'WordsNumFilter', + 'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'FlaggedWordFilter', + 'WordRepetitionFilter', 'VideoMotionScoreFilter', 'ImagePairSimilarityFilter' ] - -# yapf: enable diff --git a/data_juicer/ops/filter/image_face_count_filter.py b/data_juicer/ops/filter/image_face_count_filter.py index f58bfe05e..7f992a122 100644 --- a/data_juicer/ops/filter/image_face_count_filter.py +++ b/data_juicer/ops/filter/image_face_count_filter.py @@ -98,7 +98,6 @@ def compute_stats_single(self, sample, context=False): for key, image in images.items(): dets = detect_faces(image, model, **self.extra_kwargs) face_counts[key] = len(dets) - print(f'face counts: {face_counts}') logger.debug(f'face counts: {face_counts}') except Exception as e: logger.exception(e) diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index eb814b374..5a31f9d79 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -1,31 +1,3 @@ -# yapf: disable -from . 
import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper, - clean_copyright_mapper, clean_email_mapper, clean_html_mapper, - clean_ip_mapper, clean_links_mapper, expand_macro_mapper, - extract_qa_mapper, fix_unicode_mapper, - generate_instruction_mapper, image_blur_mapper, - image_captioning_from_gpt4v_mapper, image_captioning_mapper, - image_diffusion_mapper, image_face_blur_mapper, - image_tagging_mapper, nlpaug_en_mapper, nlpcda_zh_mapper, - optimize_instruction_mapper, punctuation_normalization_mapper, - remove_bibliography_mapper, remove_comments_mapper, - remove_header_mapper, remove_long_words_mapper, - remove_non_chinese_character_mapper, - remove_repeat_sentences_mapper, remove_specific_chars_mapper, - remove_table_text_mapper, - remove_words_with_incorrect_substrings_mapper, - replace_content_mapper, sentence_split_mapper, - video_captioning_from_audio_mapper, - video_captioning_from_frames_mapper, - video_captioning_from_summarizer_mapper, - video_captioning_from_video_mapper, video_face_blur_mapper, - video_ffmpeg_wrapped_mapper, video_remove_watermark_mapper, - video_resize_aspect_ratio_mapper, - video_resize_resolution_mapper, video_split_by_duration_mapper, - video_split_by_key_frame_mapper, video_split_by_scene_mapper, - video_tagging_from_audio_mapper, - video_tagging_from_frames_mapper, - whitespace_normalization_mapper) from .audio_ffmpeg_wrapped_mapper import AudioFFmpegWrappedMapper from .chinese_convert_mapper import ChineseConvertMapper from .clean_copyright_mapper import CleanCopyrightMapper @@ -127,5 +99,3 @@ 'VideoFaceBlurMapper', 'ImageTaggingMapper', ] - -# yapf: enable diff --git a/data_juicer/ops/selector/__init__.py b/data_juicer/ops/selector/__init__.py index a90f6db8e..22df12987 100644 --- a/data_juicer/ops/selector/__init__.py +++ b/data_juicer/ops/selector/__init__.py @@ -1,5 +1,3 @@ -from . 
import (frequency_specified_field_selector, random_selector, - range_specified_field_selector, topk_specified_field_selector) from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector from .random_selector import RandomSelector from .range_specified_field_selector import RangeSpecifiedFieldSelector diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py index 2d6db9625..c1f310f56 100644 --- a/data_juicer/utils/auto_install_mapping.py +++ b/data_juicer/utils/auto_install_mapping.py @@ -2,7 +2,7 @@ MODULE_TO_PKGS = { 'aesthetics_predictor': ['simple-aesthetics-predictor'], 'cv2': ['opencv-python'], - 'fasttext': ['fasttext', 'fasttext-wheel'], + 'fasttext': ['fasttext-wheel'], 'ffmpeg': ['ffmpeg-python'], 'PIL': ['Pillow'], 'ram': ['ram@git+https://github.com/xinyu1205/recognize-anything.git'], @@ -31,7 +31,6 @@ 'video_tagging_from_audio_mapper' ], 'easyocr': ['video_ocr_area_ratio_filter'], - 'fasttext': ['language_id_score_filter'], 'fasttext-wheel': ['language_id_score_filter'], 'kenlm': ['perplexity_filter'], 'sentencepiece': [ diff --git a/docs/Operators.md b/docs/Operators.md index 27915dc09..f84cc25a0 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -13,7 +13,7 @@ The operators in Data-Juicer are categorized into 5 types. 
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data | | [ Mapper ]( #mapper ) | 47 | Edits and transforms samples | | [ Filter ]( #filter ) | 43 | Filters out low-quality samples | -| [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples | +| [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index 115e6f446..5a7c8ddda 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -13,7 +13,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 | | [ Mapper ]( #mapper ) | 47 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 43 | 过滤低质量样本 | -| [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 | +| [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | 下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。 diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 80fd1769a..c83259e39 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -27,3 +27,4 @@ dill==0.3.4 psutil pydantic>=2.0 Pillow +numpy<2 diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 575fd7b97..f1e613126 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,7 +1,6 @@ torch>=1.11.0 torchaudio easyocr==1.7.1 -fasttext fasttext-wheel kenlm sentencepiece @@ -25,5 +24,5 @@ simple-aesthetics-predictor scenedetect[opencv] ffmpeg-python opencv-python -vllm +vllm>=0.1.3 rouge