From 910ab2a3b29b49981c826ab6c66149dd73f2a3de Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 30 May 2025 14:39:40 +0800 Subject: [PATCH 1/4] fix self-cognition load_from_cache_file --- swift/llm/dataset/dataset/llm.py | 5 +++-- swift/llm/dataset/loader.py | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index e011a4dd8..85695b6d9 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -825,12 +825,12 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: class SelfCognitionPreprocessor(ResponsePreprocessor): - name: Optional[Tuple[str, str]] = None - author: Optional[Tuple[str, str]] = None def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **kwargs): self.query_suffix = query_suffix self.response_prefix = response_prefix + self.name: Optional[Tuple[str, str]] = None + self.author: Optional[Tuple[str, str]] = None super().__init__(*args, **kwargs) def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: @@ -863,4 +863,5 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: SubsetDataset( 'empty_think', preprocess_func=SelfCognitionPreprocessor(response_prefix='\n\n\n\n')), ], + dataset_name='self-cognition', tags=['chat', 'self-cognition', '🔥'])) diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py index 601302a1e..3d9ae816b 100644 --- a/swift/llm/dataset/loader.py +++ b/swift/llm/dataset/loader.py @@ -422,18 +422,23 @@ def load( def init_self_cognition_preprocessor( + dataset_meta, model_name: Union[Tuple[str, str], List[str], None] = None, model_author: Union[Tuple[str, str], List[str], None] = None, ) -> None: from .dataset.llm import SelfCognitionPreprocessor - # zh, en - for key in ['model_name', 'model_author']: - val = locals()[key] - if isinstance(val, str): - val = [val] - if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None): - val = (val[0], val[0]) - setattr(SelfCognitionPreprocessor, key[len('model_'):], val) + preprocess_funcs = [dataset_meta.preprocess_func] + preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)] + for preprocess_func in preprocess_funcs: + if isinstance(preprocess_func, SelfCognitionPreprocessor): + # zh, en + for key in ['model_name', 'model_author']: + val = locals()[key] + if isinstance(val, str): + val = [val] + if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None): + val = (val[0], val[0]) + setattr(preprocess_func, key[len('model_'):], val) def load_dataset( @@ -479,7 +484,7 @@ def load_dataset( Returns: The train dataset and val dataset """ - init_self_cognition_preprocessor(model_name, model_author) + init_self_cognition_preprocessor(DATASET_MAPPING['self-cognition'], model_name, model_author) if isinstance(datasets, str): datasets = [datasets] if not isinstance(seed, np.random.RandomState): From 14992e832b07b4102be41fb3c37a8ff5ef9a6e2b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 30 May 2025 14:52:48 +0800 Subject: [PATCH 2/4] update --- swift/llm/argument/base_args/data_args.py | 9 ++++---- swift/llm/dataset/dataset/llm.py | 4 ++++ swift/llm/dataset/loader.py | 25 +++++++++++++++-------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py index 19d7462bc..86aac2bcd 100644 --- a/swift/llm/argument/base_args/data_args.py +++ b/swift/llm/argument/base_args/data_args.py @@ -22,9 +22,9 @@ class DataArguments: streaming (bool): Flag to enable streaming of datasets. Default is False. download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'. columns: Used for manual column mapping of datasets. - model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None]. + model_name (List[str]): List containing Chinese and English names of the model. Default is None. model_author (List[str]): List containing Chinese and English names of the model author. - Default is [None, None]. + Default is None. custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None. """ # dataset_id or dataset_dir or dataset_path @@ -49,9 +49,8 @@ class DataArguments: strict: bool = False remove_unused_columns: bool = True # Chinese name and English name - model_name: List[str] = field(default_factory=lambda: [None, None], metadata={'help': "e.g. ['小黄', 'Xiao Huang']"}) - model_author: List[str] = field( - default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"}) + model_name: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['小黄', 'Xiao Huang']"}) + model_author: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['魔搭', 'ModelScope']"}) custom_dataset_info: List[str] = field(default_factory=list) # .json diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index 85695b6d9..70a00da78 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -833,6 +833,10 @@ def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **k self.author: Optional[Tuple[str, str]] = None super().__init__(*args, **kwargs) + def set_name_author(self, name, author): + self.name = name + self.author = author + def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: for key in ['name', 'author']: val = getattr(self, key) diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py index 3d9ae816b..6bc1e8a4c 100644 --- a/swift/llm/dataset/loader.py +++ b/swift/llm/dataset/loader.py @@ -426,19 +426,26 @@ def init_self_cognition_preprocessor( model_name: Union[Tuple[str, str], List[str], None] = None, model_author: Union[Tuple[str, str], List[str], None] = None, ) -> None: + if dataset_meta is None or model_name is None or model_author is None: + return + kwargs = {} + # zh, en + for key in ['name', 'author']: + val = locals()[f'model_{key}'] + if isinstance(val, str): + val = [val] + if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None): + val = (val[0], val[0]) + kwargs[key] = val + from .dataset.llm import SelfCognitionPreprocessor preprocess_funcs = [dataset_meta.preprocess_func] preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)] for preprocess_func in preprocess_funcs: if isinstance(preprocess_func, SelfCognitionPreprocessor): - # zh, en - for key in ['model_name', 'model_author']: - val = locals()[key] - if isinstance(val, str): - val = [val] - if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None): - val = (val[0], val[0]) - setattr(preprocess_func, key[len('model_'):], val) + preprocess_func.set_name_author(**kwargs) + logger.info_once( + f"Successfully set name: {kwargs['name']}, author: {kwargs['author']} in SelfCognitionPreprocessor.") def load_dataset( @@ -484,7 +491,7 @@ def load_dataset( Returns: The train dataset and val dataset """ - init_self_cognition_preprocessor(DATASET_MAPPING['self-cognition'], model_name, model_author) + init_self_cognition_preprocessor(DATASET_MAPPING.get('self-cognition'), model_name, model_author) if isinstance(datasets, str): datasets = [datasets] if not isinstance(seed, np.random.RandomState): From fbdb4ffd40631f666329ae1ece93c48916167013 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 30 May 2025 15:04:53 +0800 Subject: [PATCH 3/4] update --- swift/llm/dataset/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py index 6bc1e8a4c..bb090cdfd 100644 --- a/swift/llm/dataset/loader.py +++ b/swift/llm/dataset/loader.py @@ -426,7 +426,7 @@ def init_self_cognition_preprocessor( model_name: Union[Tuple[str, str], List[str], None] = None, model_author: Union[Tuple[str, str], List[str], None] = None, ) -> None: - if dataset_meta is None or model_name is None or model_author is None: + if dataset_meta is None or model_name is None and model_author is None: return kwargs = {} # zh, en @@ -444,8 +444,8 @@ def init_self_cognition_preprocessor( for preprocess_func in preprocess_funcs: if isinstance(preprocess_func, SelfCognitionPreprocessor): preprocess_func.set_name_author(**kwargs) - logger.info_once( - f"Successfully set name: {kwargs['name']}, author: {kwargs['author']} in SelfCognitionPreprocessor.") + logger.info_once(f"SelfCognitionPreprocessor has been successfully configured with name: {kwargs['name']}, " + f"author: {kwargs['author']}.") def load_dataset( From d0252480e0255b1eac4b3211ff87828ea6702b57 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 30 May 2025 15:11:47 +0800 Subject: [PATCH 4/4] update --- swift/llm/argument/base_args/data_args.py | 3 +-- swift/llm/dataset/loader.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py index 86aac2bcd..10150485c 100644 --- a/swift/llm/argument/base_args/data_args.py +++ b/swift/llm/argument/base_args/data_args.py @@ -23,8 +23,7 @@ class DataArguments: download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'. columns: Used for manual column mapping of datasets. model_name (List[str]): List containing Chinese and English names of the model. Default is None. - model_author (List[str]): List containing Chinese and English names of the model author. - Default is None. + model_author (List[str]): List containing Chinese and English names of the model author. Default is None. custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None. """ # dataset_id or dataset_dir or dataset_path diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py index bb090cdfd..075f8cc84 100644 --- a/swift/llm/dataset/loader.py +++ b/swift/llm/dataset/loader.py @@ -422,7 +422,7 @@ def load( def init_self_cognition_preprocessor( - dataset_meta, + dataset_meta: Optional[DatasetMeta], model_name: Union[Tuple[str, str], List[str], None] = None, model_author: Union[Tuple[str, str], List[str], None] = None, ) -> None: