From 910ab2a3b29b49981c826ab6c66149dd73f2a3de Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Fri, 30 May 2025 14:39:40 +0800
Subject: [PATCH 1/4] fix self-cognition load_from_cache_file

---
 swift/llm/dataset/dataset/llm.py |  5 +++--
 swift/llm/dataset/loader.py      | 23 ++++++++++++++---------
 2 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
index e011a4dd8..85695b6d9 100644
--- a/swift/llm/dataset/dataset/llm.py
+++ b/swift/llm/dataset/dataset/llm.py
@@ -825,12 +825,12 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
 
 
 class SelfCognitionPreprocessor(ResponsePreprocessor):
-    name: Optional[Tuple[str, str]] = None
-    author: Optional[Tuple[str, str]] = None
 
     def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **kwargs):
         self.query_suffix = query_suffix
         self.response_prefix = response_prefix
+        self.name: Optional[Tuple[str, str]] = None
+        self.author: Optional[Tuple[str, str]] = None
         super().__init__(*args, **kwargs)
 
     def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
@@ -863,4 +863,5 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
             SubsetDataset(
                 'empty_think', preprocess_func=SelfCognitionPreprocessor(response_prefix='<think>\n\n</think>\n\n')),
         ],
+        dataset_name='self-cognition',
         tags=['chat', 'self-cognition', '🔥']))
diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py
index 601302a1e..3d9ae816b 100644
--- a/swift/llm/dataset/loader.py
+++ b/swift/llm/dataset/loader.py
@@ -422,18 +422,23 @@ def load(
 
 
 def init_self_cognition_preprocessor(
+    dataset_meta,
     model_name: Union[Tuple[str, str], List[str], None] = None,
     model_author: Union[Tuple[str, str], List[str], None] = None,
 ) -> None:
     from .dataset.llm import SelfCognitionPreprocessor
-    # zh, en
-    for key in ['model_name', 'model_author']:
-        val = locals()[key]
-        if isinstance(val, str):
-            val = [val]
-        if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
-            val = (val[0], val[0])
-        setattr(SelfCognitionPreprocessor, key[len('model_'):], val)
+    preprocess_funcs = [dataset_meta.preprocess_func]
+    preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)]
+    for preprocess_func in preprocess_funcs:
+        if isinstance(preprocess_func, SelfCognitionPreprocessor):
+            # zh, en
+            for key in ['model_name', 'model_author']:
+                val = locals()[key]
+                if isinstance(val, str):
+                    val = [val]
+                if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
+                    val = (val[0], val[0])
+                setattr(preprocess_func, key[len('model_'):], val)
 
 
 def load_dataset(
@@ -479,7 +484,7 @@ def load_dataset(
     Returns:
         The train dataset and val dataset
     """
-    init_self_cognition_preprocessor(model_name, model_author)
+    init_self_cognition_preprocessor(DATASET_MAPPING['self-cognition'], model_name, model_author)
     if isinstance(datasets, str):
         datasets = [datasets]
     if not isinstance(seed, np.random.RandomState):

From 14992e832b07b4102be41fb3c37a8ff5ef9a6e2b Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Fri, 30 May 2025 14:52:48 +0800
Subject: [PATCH 2/4] default model_name/model_author to None; add set_name_author

---
 swift/llm/argument/base_args/data_args.py |  9 ++++----
 swift/llm/dataset/dataset/llm.py          |  4 ++++
 swift/llm/dataset/loader.py               | 25 +++++++++++++++--------
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py
index 19d7462bc..86aac2bcd 100644
--- a/swift/llm/argument/base_args/data_args.py
+++ b/swift/llm/argument/base_args/data_args.py
@@ -22,9 +22,9 @@ class DataArguments:
         streaming (bool): Flag to enable streaming of datasets. Default is False.
         download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'.
         columns: Used for manual column mapping of datasets.
-        model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None].
+        model_name (List[str]): List containing Chinese and English names of the model. Default is None.
         model_author (List[str]): List containing Chinese and English names of the model author.
-            Default is [None, None].
+            Default is None.
         custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None.
     """
     # dataset_id or dataset_dir or dataset_path
@@ -49,9 +49,8 @@ class DataArguments:
     strict: bool = False
     remove_unused_columns: bool = True
     # Chinese name and English name
-    model_name: List[str] = field(default_factory=lambda: [None, None], metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
-    model_author: List[str] = field(
-        default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
+    model_name: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
+    model_author: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
 
     custom_dataset_info: List[str] = field(default_factory=list)  # .json
 
diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
index 85695b6d9..70a00da78 100644
--- a/swift/llm/dataset/dataset/llm.py
+++ b/swift/llm/dataset/dataset/llm.py
@@ -833,6 +833,10 @@ def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **k
         self.author: Optional[Tuple[str, str]] = None
         super().__init__(*args, **kwargs)
 
+    def set_name_author(self, name, author):
+        self.name = name
+        self.author = author
+
     def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
         for key in ['name', 'author']:
             val = getattr(self, key)
diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py
index 3d9ae816b..6bc1e8a4c 100644
--- a/swift/llm/dataset/loader.py
+++ b/swift/llm/dataset/loader.py
@@ -426,19 +426,26 @@ def init_self_cognition_preprocessor(
     model_name: Union[Tuple[str, str], List[str], None] = None,
     model_author: Union[Tuple[str, str], List[str], None] = None,
 ) -> None:
+    if dataset_meta is None or model_name is None or model_author is None:
+        return
+    kwargs = {}
+    # zh, en
+    for key in ['name', 'author']:
+        val = locals()[f'model_{key}']
+        if isinstance(val, str):
+            val = [val]
+        if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
+            val = (val[0], val[0])
+        kwargs[key] = val
+
     from .dataset.llm import SelfCognitionPreprocessor
     preprocess_funcs = [dataset_meta.preprocess_func]
     preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)]
     for preprocess_func in preprocess_funcs:
         if isinstance(preprocess_func, SelfCognitionPreprocessor):
-            # zh, en
-            for key in ['model_name', 'model_author']:
-                val = locals()[key]
-                if isinstance(val, str):
-                    val = [val]
-                if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
-                    val = (val[0], val[0])
-                setattr(preprocess_func, key[len('model_'):], val)
+            preprocess_func.set_name_author(**kwargs)
+    logger.info_once(
+        f"Successfully set name: {kwargs['name']}, author: {kwargs['author']} in SelfCognitionPreprocessor.")
 
 
 def load_dataset(
@@ -484,7 +491,7 @@ def load_dataset(
     Returns:
         The train dataset and val dataset
     """
-    init_self_cognition_preprocessor(DATASET_MAPPING['self-cognition'], model_name, model_author)
+    init_self_cognition_preprocessor(DATASET_MAPPING.get('self-cognition'), model_name, model_author)
     if isinstance(datasets, str):
         datasets = [datasets]
     if not isinstance(seed, np.random.RandomState):

From fbdb4ffd40631f666329ae1ece93c48916167013 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Fri, 30 May 2025 15:04:53 +0800
Subject: [PATCH 3/4] skip self-cognition init only when both name and author are None; reword log

---
 swift/llm/dataset/loader.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py
index 6bc1e8a4c..bb090cdfd 100644
--- a/swift/llm/dataset/loader.py
+++ b/swift/llm/dataset/loader.py
@@ -426,7 +426,7 @@ def init_self_cognition_preprocessor(
     model_name: Union[Tuple[str, str], List[str], None] = None,
     model_author: Union[Tuple[str, str], List[str], None] = None,
 ) -> None:
-    if dataset_meta is None or model_name is None or model_author is None:
+    if dataset_meta is None or model_name is None and model_author is None:
         return
     kwargs = {}
     # zh, en
@@ -444,8 +444,8 @@ def init_self_cognition_preprocessor(
     for preprocess_func in preprocess_funcs:
         if isinstance(preprocess_func, SelfCognitionPreprocessor):
             preprocess_func.set_name_author(**kwargs)
-    logger.info_once(
-        f"Successfully set name: {kwargs['name']}, author: {kwargs['author']} in SelfCognitionPreprocessor.")
+    logger.info_once(f"SelfCognitionPreprocessor has been successfully configured with name: {kwargs['name']}, "
+                     f"author: {kwargs['author']}.")
 
 
 def load_dataset(

From d0252480e0255b1eac4b3211ff87828ea6702b57 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Fri, 30 May 2025 15:11:47 +0800
Subject: [PATCH 4/4] annotate dataset_meta as Optional[DatasetMeta]; tidy docstring

---
 swift/llm/argument/base_args/data_args.py | 3 +--
 swift/llm/dataset/loader.py               | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py
index 86aac2bcd..10150485c 100644
--- a/swift/llm/argument/base_args/data_args.py
+++ b/swift/llm/argument/base_args/data_args.py
@@ -23,8 +23,7 @@ class DataArguments:
         download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'.
         columns: Used for manual column mapping of datasets.
         model_name (List[str]): List containing Chinese and English names of the model. Default is None.
-        model_author (List[str]): List containing Chinese and English names of the model author.
-            Default is None.
+        model_author (List[str]): List containing Chinese and English names of the model author. Default is None.
         custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None.
     """
     # dataset_id or dataset_dir or dataset_path
diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py
index bb090cdfd..075f8cc84 100644
--- a/swift/llm/dataset/loader.py
+++ b/swift/llm/dataset/loader.py
@@ -422,7 +422,7 @@ def load(
 
 
 def init_self_cognition_preprocessor(
-    dataset_meta,
+    dataset_meta: Optional[DatasetMeta],
     model_name: Union[Tuple[str, str], List[str], None] = None,
     model_author: Union[Tuple[str, str], List[str], None] = None,
 ) -> None: