Skip to content

[dataset] fix self-cognition & load_from_cache_file #4426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into the base branch from the pull-request branch
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions swift/llm/argument/base_args/data_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ class DataArguments:
streaming (bool): Flag to enable streaming of datasets. Default is False.
download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'.
columns: Used for manual column mapping of datasets.
model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None].
model_author (List[str]): List containing Chinese and English names of the model author.
Default is [None, None].
model_name (List[str]): List containing Chinese and English names of the model. Default is None.
model_author (List[str]): List containing Chinese and English names of the model author. Default is None.
custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None.
"""
# dataset_id or dataset_dir or dataset_path
Expand All @@ -49,9 +48,8 @@ class DataArguments:
strict: bool = False
remove_unused_columns: bool = True
# Chinese name and English name
model_name: List[str] = field(default_factory=lambda: [None, None], metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
model_author: List[str] = field(
default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
model_name: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
model_author: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['魔搭', 'ModelScope']"})

custom_dataset_info: List[str] = field(default_factory=list) # .json

Expand Down
9 changes: 7 additions & 2 deletions swift/llm/dataset/dataset/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -825,14 +825,18 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:


class SelfCognitionPreprocessor(ResponsePreprocessor):
    """Preprocessor for the 'self-cognition' dataset.

    Holds the model name/author (zh, en) pairs that get substituted into the
    dataset rows; the pairs start as ``None`` and are filled in later through
    ``set_name_author``.
    """

    def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **kwargs):
        # Text appended to each query / prepended to each response
        # (e.g. an empty '<think>\n\n</think>\n\n' block for the
        # 'empty_think' subset — see the registration site in this file).
        self.query_suffix = query_suffix
        self.response_prefix = response_prefix
        # (zh, en) pairs; remain None until configured by the dataset loader.
        self.name: Optional[Tuple[str, str]] = None
        self.author: Optional[Tuple[str, str]] = None
        super().__init__(*args, **kwargs)

    def set_name_author(self, name, author):
        """Store the (zh, en) model name/author pairs used during preprocessing."""
        self.name, self.author = name, author

def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
for key in ['name', 'author']:
val = getattr(self, key)
Expand Down Expand Up @@ -863,4 +867,5 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
SubsetDataset(
'empty_think', preprocess_func=SelfCognitionPreprocessor(response_prefix='<think>\n\n</think>\n\n')),
],
dataset_name='self-cognition',
tags=['chat', 'self-cognition', '🔥']))
22 changes: 17 additions & 5 deletions swift/llm/dataset/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,18 +422,30 @@ def load(


def init_self_cognition_preprocessor(
    dataset_meta: Optional['DatasetMeta'],
    model_name: Union[Tuple[str, str], List[str], None] = None,
    model_author: Union[Tuple[str, str], List[str], None] = None,
) -> None:
    """Configure the SelfCognitionPreprocessor instances attached to *dataset_meta*.

    Args:
        dataset_meta: Metadata of the 'self-cognition' dataset. ``None`` means the
            dataset is not registered, in which case nothing is configured.
        model_name: Chinese/English names of the model, ordered (zh, en). A plain
            string or a single-element list is broadcast to both languages.
            Default is None.
        model_author: Chinese/English names of the model author; same
            normalization rules as ``model_name``. Default is None.

    Returns:
        None. The matching preprocessor instances are mutated in place via
        ``set_name_author``.
    """
    # Nothing to do when the dataset is unknown or neither value was provided.
    if dataset_meta is None or (model_name is None and model_author is None):
        return
    kwargs = {}
    # Normalize each value to a (zh, en) pair; a single value is used for both.
    # An explicit mapping is used instead of locals()[f'model_{key}'], which is
    # fragile under renames and opaque to static analysis.
    for key, val in {'name': model_name, 'author': model_author}.items():
        if isinstance(val, str):
            val = [val]
        if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
            val = (val[0], val[0])
        kwargs[key] = val

    # Imported lazily to avoid a circular import at module load time.
    from .dataset.llm import SelfCognitionPreprocessor
    preprocess_funcs = [dataset_meta.preprocess_func]
    preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)]
    for preprocess_func in preprocess_funcs:
        if isinstance(preprocess_func, SelfCognitionPreprocessor):
            preprocess_func.set_name_author(**kwargs)
    logger.info_once(f"SelfCognitionPreprocessor has been successfully configured with name: {kwargs['name']}, "
                     f"author: {kwargs['author']}.")


def load_dataset(
Expand Down Expand Up @@ -479,7 +491,7 @@ def load_dataset(
Returns:
The train dataset and val dataset
"""
init_self_cognition_preprocessor(model_name, model_author)
init_self_cognition_preprocessor(DATASET_MAPPING.get('self-cognition'), model_name, model_author)
if isinstance(datasets, str):
datasets = [datasets]
if not isinstance(seed, np.random.RandomState):
Expand Down
Loading