Skip to content

Commit f0072f3

Browse files
authored
use max of 32 dataset processes if not explicit (axolotl-ai-cloud#2403)
* use max of 32 dataset processes if not explicit
* change alternate min val for consistency
1 parent 59899b9 commit f0072f3

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

src/axolotl/core/datasets/chat.py

+1 -1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def map_fn(ex):
 process_or_cpu_count: int = (
 process_count or os.cpu_count() # type: ignore[assignment]
 )
-num_proc = min(64, process_or_cpu_count)
+num_proc = min(32, process_or_cpu_count)
 features = data.features.keys()
 tokenized_data = data.map(
 map_fn,

src/axolotl/utils/config/models/input/v0_4_1/__init__.py

+1 -1
Original file line numberDiff line numberDiff line change
@@ -728,7 +728,7 @@ class AxolotlInputConfig(
 default=None,
 json_schema_extra={"description": "streaming dataset to use for pretraining"},
 )
-dataset_processes: Optional[int] = Field(default=os.cpu_count())
+dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count())) # type: ignore[type-var]
 dataset_exact_deduplication: Optional[bool] = None
 dataset_keep_in_memory: Optional[bool] = None
 dataloader_pin_memory: Optional[bool] = None

0 commit comments

Comments
 (0)