
Commit 76cf875

Merge pull request #1374 from bghira/main

merge

2 parents 2c8edeb + 7abe9b5

File tree: 8 files changed, +109 −27 lines

docker-start.sh

Lines changed: 14 additions & 4 deletions
```diff
@@ -4,7 +4,7 @@
 # This file can then later be sourced in a login shell
 echo "Exporting environment variables..."
 printenv |
-    grep -E '^RUNPOD_|^PATH=|^HF_HOME=|^HUGGING_FACE_HUB_TOKEN=|^_=' |
+    grep -E '^RUNPOD_|^PATH=|^HF_HOME=|^HF_TOKEN=|^HUGGING_FACE_HUB_TOKEN=|^WANDB_API_KEY=|^WANDB_TOKEN=|^_=' |
     sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >>/etc/rp_environment

 # Add it to Bash login script
@@ -26,9 +26,19 @@ fi
 # Start SSH server
 service ssh start

-# Load HF, WanDB tokens
-if [ -n "$HUGGING_FACE_HUB_TOKEN" ]; then huggingface-cli login --token "$HUGGING_FACE_HUB_TOKEN" --add-to-git-credential; else echo "HUGGING_FACE_HUB_TOKEN not set; skipping login"; fi
-if [ -n "$WANDB_TOKEN" ]; then wandb login "$WANDB_TOKEN"; else echo "WANDB_TOKEN not set; skipping login"; fi
+# Login to HF
+if [[ -n "${HF_TOKEN:-$HUGGING_FACE_HUB_TOKEN}" ]]; then
+    huggingface-cli login --token "${HF_TOKEN:-$HUGGING_FACE_HUB_TOKEN}" --add-to-git-credential
+else
+    echo "HF_TOKEN or HUGGING_FACE_HUB_TOKEN not set; skipping login"
+fi
+
+# Login to WanDB
+if [[ -n "${WANDB_API_KEY:-$WANDB_TOKEN}" ]]; then
+    wandb login "${WANDB_API_KEY:-$WANDB_TOKEN}"
+else
+    echo "WANDB_API_KEY or WANDB_TOKEN not set; skipping login"
+fi

 # 🫡
 sleep infinity
```
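The new grep pattern exports both the legacy and the current token variable names into `/etc/rp_environment`, and each login block prefers the newer name via `${VAR:-fallback}` parameter expansion. A minimal Python sketch of the same fallback behavior, assuming the `huggingface_hub` and `wandb` packages are present in the image (the equivalence to the CLI calls is an assumption):

```python
import os

import wandb
from huggingface_hub import login

# Prefer the newer variable, fall back to the legacy one, mirroring
# the ${HF_TOKEN:-$HUGGING_FACE_HUB_TOKEN} expansion in the script.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    print("HF_TOKEN or HUGGING_FACE_HUB_TOKEN not set; skipping login")

wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB_TOKEN")
if wandb_key:
    wandb.login(key=wandb_key)
else:
    print("WANDB_API_KEY or WANDB_TOKEN not set; skipping login")
```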

documentation/DOCKER.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -39,7 +39,7 @@ This command sets up the container with GPU access and maps the SSH port for ext
 To facilitate integration with external tools, the container supports environment variables for Huggingface and WandB tokens. Pass these at runtime as follows:

 ```bash
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN='your_token' -e WANDB_TOKEN='your_token' -it -p 22:22 simpletuner
+docker run --gpus all -e HF_TOKEN='your_token' -e WANDB_API_KEY='your_token' -it -p 22:22 simpletuner
 ```

 ### 4. Data Volumes
@@ -98,8 +98,8 @@ services:
       - "[path to your datasets]:/datasets"
       - "[path to your configs]:/workspace/SimpleTuner/config"
     environment:
-      HUGGING_FACE_HUB_TOKEN: [your hugging face token]
-      WANDB_TOKEN: [your wanddb token]
+      HF_TOKEN: [your hugging face token]
+      WANDB_API_KEY: [your wandb token]
     command: ["tail", "-f", "/dev/null"]
     deploy:
       resources:
@@ -155,4 +155,4 @@ services:
 ### General Advice

 - **Logs and Output**: Review the container logs and output for any error messages or warnings that can provide more context on the issue.
-- **Documentation and Forums**: Consult the Docker and NVIDIA CUDA documentation for more detailed troubleshooting advice. Community forums and issue trackers related to the specific software or dependencies you are using can also be valuable resources.
+- **Documentation and Forums**: Consult the Docker and NVIDIA CUDA documentation for more detailed troubleshooting advice. Community forums and issue trackers related to the specific software or dependencies you are using can also be valuable resources.
````

helpers/models/sd3/__init__.py

Whitespace-only changes.

helpers/models/sd3/pipeline.py

Lines changed: 78 additions & 14 deletions
````diff
@@ -69,7 +69,32 @@
         >>> image.save("sd3.png")
         ```
 """
-
+@torch.cuda.amp.autocast(dtype=torch.float32)
+def optimized_scale(positive_flat, negative_flat):
+
+    # Calculate dot product
+    dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+
+    # Squared norm of the unconditional prediction
+    squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
+
+    # st_star = v_cond^T * v_uncond / ||v_uncond||^2
+    st_star = dot_product / squared_norm
+
+    return st_star
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu

 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
````
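`calculate_shift` linearly interpolates the timestep shift `mu` between `base_shift` and `max_shift` as the patch sequence length grows, so higher resolutions get a stronger schedule shift. A quick worked check; the 1024 px, VAE factor 8, patch size 2 figures are the usual SD3 defaults, assumed here for illustration:

```python
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.16,
):
    # Linear interpolation in sequence length: mu = m * seq_len + b.
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

# A 1024x1024 image -> 128x128 latent -> (128 // 2) * (128 // 2) = 4096 patches.
print(calculate_shift((128 // 2) * (128 // 2)))  # 1.16, max_shift at max_seq_len
print(calculate_shift(256))                      # 0.5, base_shift at base_seq_len
```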
```diff
@@ -763,6 +788,7 @@ def __call__(
         width: Optional[int] = None,
         num_inference_steps: int = 28,
         timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -785,6 +811,11 @@ def __call__(
         skip_layer_guidance_scale: int = 2.8,
         skip_layer_guidance_stop: int = 0.2,
         skip_layer_guidance_start: int = 0.01,
+        mu: Optional[float] = None,
+        use_cfg_zero_star: Optional[bool] = True,
+        use_zero_init: Optional[bool] = True,
+        zero_steps: Optional[int] = 0,
+
     ):
         r"""
         Function invoked when calling the pipeline for generation.
```
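A hedged usage sketch of the new keyword arguments; the pipeline class name, import path, and checkpoint id are assumptions, while the keyword names come straight from the signature above:

```python
import torch
from helpers.models.sd3.pipeline import StableDiffusion3Pipeline  # assumed path/class

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe(
    "a photo of an astronaut riding a horse",
    num_inference_steps=28,
    guidance_scale=7.0,
    use_cfg_zero_star=True,  # enable the projected-CFG path
    use_zero_init=True,      # zero the prediction for early steps...
    zero_steps=0,            # ...up to and including this step index
    mu=None,                 # let dynamic shifting derive mu from the latent size
).images[0]
image.save("sd3.png")
```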
```diff
@@ -970,16 +1001,7 @@ def __call__(
             [negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0
         )

-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler, num_inference_steps, device, timesteps
-        )
-        num_warmup_steps = max(
-            len(timesteps) - num_inference_steps * self.scheduler.order, 0
-        )
-        self._num_timesteps = len(timesteps)
-
-        # 5. Prepare latent variables
+        # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
@@ -992,6 +1014,35 @@
             latents,
         )

+        # 5. Prepare timesteps
+        scheduler_kwargs = {}
+        if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
+            _, _, height, width = latents.shape
+            image_seq_len = (height // self.transformer.config.patch_size) * (
+                width // self.transformer.config.patch_size
+            )
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.base_image_seq_len,
+                self.scheduler.config.max_image_seq_len,
+                self.scheduler.config.base_shift,
+                self.scheduler.config.max_shift,
+            )
+            scheduler_kwargs["mu"] = mu
+        elif mu is not None:
+            scheduler_kwargs["mu"] = mu
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+        self._num_timesteps = len(timesteps)
+
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
```
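Timestep preparation now runs after the latents exist because dynamic shifting needs the latent grid size to compute `mu`, which is then forwarded to the scheduler through `retrieve_timesteps`. A hedged sketch of what ultimately reaches the scheduler; `FlowMatchEulerDiscreteScheduler` is the scheduler SD3 normally ships with, and its acceptance of `mu` under `use_dynamic_shifting=True` is taken from diffusers, not from this diff:

```python
from diffusers import FlowMatchEulerDiscreteScheduler

# With dynamic shifting enabled, set_timesteps() requires a mu value,
# which is exactly what the new scheduler_kwargs["mu"] plumbing supplies.
scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=True)
scheduler.set_timesteps(num_inference_steps=28, mu=1.16, device="cpu")
print(scheduler.timesteps[:4])
```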
```diff
@@ -1026,9 +1077,21 @@
                     # perform guidance
                     if self.do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self.guidance_scale * (
-                            noise_pred_text - noise_pred_uncond
-                        )
+                        if use_cfg_zero_star:
+                            positive_flat = noise_pred_text.view(batch_size, -1)
+                            negative_flat = noise_pred_uncond.view(batch_size, -1)
+
+                            alpha = optimized_scale(positive_flat, negative_flat)
+                            alpha = alpha.view(batch_size, 1, 1, 1)
+                            alpha = alpha.to(positive_flat.dtype)
+
+                            if (i <= zero_steps) and use_zero_init:
+                                noise_pred = noise_pred_text * 0.0
+                            else:
+                                noise_pred = noise_pred_uncond * alpha + guidance_scale * (noise_pred_text - noise_pred_uncond * alpha)
+                        else:
+                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
                     should_skip_layers = (
                         True
                         if i > num_inference_steps * skip_layer_guidance_start
```
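The projected branch rescales the unconditional prediction by `alpha = <cond, uncond> / ||uncond||^2`, the least-squares scale that best aligns the unconditional direction with the conditional one, before applying the usual CFG combination; the earliest steps can optionally be zeroed out. A minimal self-contained sketch on dummy tensors, with illustrative shapes:

```python
import torch

def optimized_scale(positive_flat, negative_flat):
    # alpha = <cond, uncond> / (||uncond||^2 + eps), computed per sample.
    dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
    squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
    return dot_product / squared_norm

batch_size, guidance_scale, zero_steps, i = 2, 7.0, 0, 5
noise_pred_text = torch.randn(batch_size, 16, 64, 64)
noise_pred_uncond = torch.randn(batch_size, 16, 64, 64)

alpha = optimized_scale(
    noise_pred_text.view(batch_size, -1),
    noise_pred_uncond.view(batch_size, -1),
).view(batch_size, 1, 1, 1)

if i <= zero_steps:
    noise_pred = noise_pred_text * 0.0  # zero-init for the earliest steps
else:
    # Standard CFG with the unconditional branch rescaled by alpha.
    noise_pred = noise_pred_uncond * alpha + guidance_scale * (
        noise_pred_text - noise_pred_uncond * alpha
    )
print(noise_pred.shape)  # torch.Size([2, 16, 64, 64])
```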
```diff
@@ -1810,6 +1873,7 @@ def __call__(
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 256,
+
     ):
         r"""
         Function invoked when calling the pipeline for generation.
```

helpers/prompts.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -30,7 +30,6 @@
     "fairy_garden": "Whimsical garden filled with fairies, magical plants, sparkling lights, serene atmosphere, high detail",
     "fantasy_dragon": "Majestic dragon soaring through the sky, detailed scales, dynamic pose, fantasy art, high resolution",
     "floating_islands": "Fantasy world, floating islands in the sky, waterfalls, lush vegetation, detailed landscape, high resolution",
-    "futuristic_cityscape": "Futuristic city skyline at night, neon lights, cyberpunk style, high contrast, sharp focus",
     "galactic_battle": "Space battle scene, starships fighting, laser beams, explosions, cosmic background",
     "haunted_fairground": "Abandoned fairground at night, eerie rides, ghostly figures, fog, dark atmosphere, high detail",
     "haunted_mansion": "Spooky haunted mansion on a hill, dark and eerie, glowing windows, ghostly atmosphere, high detail",
```

helpers/training/trainer.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -779,7 +779,9 @@ def init_unload_text_encoder(self):
         reclaim_memory()
         memory_after_unload = self.stats_memory_used()
         memory_saved = memory_after_unload - memory_before_unload
-        logger.info(f"After nuking text encoders from orbit, we freed {abs(round(memory_saved, 2))} GB of VRAM.")
+        logger.info(
+            f"After nuking text encoders from orbit, we freed {abs(round(memory_saved, 2))} GB of VRAM."
+        )

     def init_precision(
         self, preprocessing_models_only: bool = False, ema_only: bool = False
@@ -1650,6 +1652,8 @@ def init_resume_checkpoint(self, lr_scheduler):
                 p = group["params"][0]
                 group["running_d_numerator"] = group["running_d_numerator"].to(p.device)
                 group["running_d_denom"] = group["running_d_denom"].to(p.device)
+                if "use_focus" not in group:
+                    group["use_focus"] = False

         return lr_scheduler
```
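The `use_focus` guard backfills a param-group key that older optimizer checkpoints predate, so resuming them does not trip a `KeyError` once newer code reads the flag. A small sketch of the pattern; the optimizer is illustrative, with `False` as the default taken from the diff:

```python
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# ... optimizer.load_state_dict(...) from an older checkpoint would go here ...

# Backfill keys the old checkpoint never knew about.
for group in optimizer.param_groups:
    if "use_focus" not in group:
        group["use_focus"] = False
```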

helpers/training/validation.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -611,7 +611,7 @@ def init_vae(self):
         ).to(self.inference_device)
         StateTracker.set_vae(self.vae)

-        logger.info(f"VAE type: {type(self.vae)}")
+        # logger.info(f"VAE type: {type(self.vae)}")
         return self.vae

     def _discover_validation_input_samples(self):
@@ -1108,7 +1108,7 @@ def setup_scheduler(self):
             scheduler_args["use_beta_sigmas"] = True
             scheduler_args["shift"] = self.args.flow_schedule_shift
         if self.args.validation_noise_scheduler == "unipc":
-            scheduler_args["prediction_type"] = 'flow_prediction'
+            scheduler_args["prediction_type"] = "flow_prediction"
             scheduler_args["use_flow_sigmas"] = True
             scheduler_args["num_train_timesteps"] = 1000
             scheduler_args["flow_shift"] = self.args.flow_schedule_shift
```
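For reference, the four arguments set for `unipc` configure a UniPC scheduler to consume flow-matching sigmas instead of the default DDPM betas. A hedged construction sketch; whether your installed `diffusers` version exposes these parameters on `UniPCMultistepScheduler` should be verified, and the shift value stands in for `args.flow_schedule_shift`:

```python
from diffusers import UniPCMultistepScheduler

scheduler = UniPCMultistepScheduler(
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=3.0,  # placeholder for args.flow_schedule_shift
)
```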

train.sh

Lines changed: 6 additions & 1 deletion
```diff
@@ -91,7 +91,12 @@ if [ -z "${DISABLE_UPDATES}" ]; then
 fi
 # Run the training script.
 if [[ -z "${ACCELERATE_CONFIG_PATH}" ]]; then
-    ACCELERATE_CONFIG_PATH="${HOME}/.cache/huggingface/accelerate/default_config.yaml"
+    # Look for the accelerate config in HF_HOME first, otherwise fall back to $HOME
+    if [[ -f "${HF_HOME}/accelerate/default_config.yaml" ]]; then
+        ACCELERATE_CONFIG_PATH="${HF_HOME}/accelerate/default_config.yaml"
+    else
+        ACCELERATE_CONFIG_PATH="${HOME}/.cache/huggingface/accelerate/default_config.yaml"
+    fi
 fi
 if [ -f "${ACCELERATE_CONFIG_PATH}" ]; then
     echo "Using Accelerate config file: ${ACCELERATE_CONFIG_PATH}"
```
