diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index abeb6fc27..5c6828b07 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -129,9 +129,20 @@ def __getattr__(cls, attr): cls._accessed_by[caller_class].add(stat_key) return stat_key - def get_access_log(cls, dj_cfg=None): + def get_access_log(cls, dj_cfg=None, dataset=None): if cls._accessed_by: return cls._accessed_by + elif dj_cfg and dataset: + tmp_dj_cfg = copy.deepcopy(dj_cfg) + tmp_dj_cfg.use_cache = False + tmp_dj_cfg.use_checkpoint = False + + from data_juicer.core import Analyzer + tmp_analyzer = Analyzer(tmp_dj_cfg) + + dataset = dataset.take(1) + # do not overwrite the true analysis results + tmp_analyzer.run(dataset=dataset, skip_export=True) elif dj_cfg: tmp_dj_cfg = copy.deepcopy(dj_cfg) # the access has been skipped due to the use of cache @@ -175,9 +186,6 @@ def get_access_log(cls, dj_cfg=None): tmp_dj_cfg.use_cache = False tmp_dj_cfg.use_checkpoint = False - from data_juicer.config import get_init_configs - tmp_dj_cfg = get_init_configs(tmp_dj_cfg) - from data_juicer.core import Analyzer tmp_analyzer = Analyzer(tmp_dj_cfg) # do not overwrite the true analysis results