REFACTOR TO THE MAX (huggingface#7)
lewtun authored Jan 24, 2025
1 parent 4cb6c95 commit d8aa42d
Showing 19 changed files with 121 additions and 1,285 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src
 
-check_dirs := src scripts
+check_dirs := src
 
 style:
 	black --line-length 119 --target-version py310 $(check_dirs) setup.py
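The narrowed `check_dirs` means formatting now covers only `src` (plus `setup.py`). A minimal sketch of how the `style` target's command expands after this change — the variable names here are illustrative, not from the Makefile:

```shell
# Sketch (not from the commit): how the `style` target's command expands
# after this change, with check_dirs narrowed from "src scripts" to "src".
CHECK_DIRS="src"
STYLE_CMD="black --line-length 119 --target-version py310 $CHECK_DIRS setup.py"
echo "$STYLE_CMD"
```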
4 changes: 4 additions & 0 deletions README.md
@@ -55,6 +55,10 @@ If it isn't installed, run:
 sudo apt-get install git-lfs
 ```
 
+## Training models
+
+
+
 ## Evaluating models
 
 For small models, use `--data_parallel=$NUM_GPUS`; for large models, shard with `--tensor_parallel=$NUM_GPUS`.
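The data-parallel vs. tensor-parallel choice in the README line above can be sketched as a small shell helper. The size threshold and variable names are illustrative assumptions, not from the repo:

```shell
# Pick the evaluation parallelism flag by model size (illustrative threshold).
NUM_GPUS=8
MODEL_SIZE_B=70   # model size in billions of parameters (example value)

if [ "$MODEL_SIZE_B" -le 13 ]; then
  # Small model: fits on one GPU, so replicate it and split the data.
  PARALLEL_ARGS="--data_parallel=$NUM_GPUS"
else
  # Large model: shard the weights across the GPUs instead.
  PARALLEL_ARGS="--tensor_parallel=$NUM_GPUS"
fi
echo "$PARALLEL_ARGS"
```

Data parallelism copies the whole model to each GPU and splits the evaluation batch; tensor parallelism splits the weights themselves, which is what makes models too large for one GPU feasible.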
File renamed without changes.
20 changes: 2 additions & 18 deletions recipes/launch.slurm → launch.slurm
@@ -24,29 +24,13 @@ echo "PYTHON ENV: $(which python)"
 MODEL=Qwen2.5-1.5B-Instruct
 TASK=sft
 PRECISION=v00.00
-ACCELERATOR=deepspeed_zero3
+ACCELERATOR=zero3
 
 # Training setup
 NUM_NODES=$SLURM_NNODES
 GPUS_PER_NODE=8
 WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
-# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
 CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
 
 echo "CONFIG_FILE: $CONFIG_FILE"
 GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
-
-
-# Loop through the arguments and find the one with "--gradient_accumulation_steps"
-for arg in "${ARGS[@]}"; do
-  if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
-    # Extract the value after the equals sign
-    GRAD_ACC_STEPS="${arg#*=}"
-    break # Exit the loop once we find the desired argument
-  fi
-done
-
-echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
 # so processes know who to talk to
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
@@ -56,7 +40,7 @@ export CMD=" \
 "
 
 export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
-    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
+    --config_file accelerate_configs/$ACCELERATOR.yaml \
     --gradient_accumulation_steps $GRAD_ACC_STEPS \
     --num_machines $NUM_NODES \
     --num_processes $WORLD_SIZE \
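The bash loop deleted from `launch.slurm` let a `--gradient_accumulation_steps=N` CLI flag override the value grepped out of the YAML config. That override logic can be exercised standalone; here is a sketch with hypothetical config and argument values:

```shell
# Recreate the deleted behaviour: the YAML value is the default, and a
# --gradient_accumulation_steps=N CLI flag (if present) overrides it.
CONFIG_FILE=$(mktemp)
printf 'gradient_accumulation_steps: 4\n' > "$CONFIG_FILE"

# Naive grep/awk parse of the YAML key, as in the original script.
GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' "$CONFIG_FILE" | awk '{print $2}')

# Hypothetical launch arguments; the flag wins over the config value.
ARGS=(--learning_rate=2e-5 --gradient_accumulation_steps=8)
for arg in "${ARGS[@]}"; do
  if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
    GRAD_ACC_STEPS="${arg#*=}"   # strip everything up to and including "="
    break
  fi
done

rm -f "$CONFIG_FILE"
echo "$GRAD_ACC_STEPS"
```

The duplication existed because Accelerate's DeepSpeed config and Transformers' `TrainingArguments` each carry their own gradient-accumulation setting; parsing one source of truth kept them in sync, at the cost of the fragile grep shown here.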
46 changes: 0 additions & 46 deletions recipes/Qwen2.5-1.5B-Instruct/sft/config_v00.00.yaml

This file was deleted.

26 changes: 0 additions & 26 deletions recipes/accelerate_configs/fsdp.yaml

This file was deleted.

25 changes: 0 additions & 25 deletions recipes/accelerate_configs/fsdp_qlora.yaml

This file was deleted.

16 changes: 0 additions & 16 deletions recipes/accelerate_configs/multi_gpu.yaml

This file was deleted.

1 change: 0 additions & 1 deletion scripts/training/README.md

This file was deleted.
