
Commit 0a229a9

clean up flux-1.schnell benchmark and add to docs
Summary:

Combining several improvements to the flux-1.schnell benchmark in a single PR:

* set `num_inference_steps` to 4 to match the default for this model
* turn on CUDA graphs via `reduce-overhead` to improve nvfp4 performance at batch size 1; this required patching the transformer blocks' forward methods, using code from jbschlosser
* add a larger batch size for additional performance metrics at larger shapes
* remove torch.compile from the VAE, since compile times are too long and the goal of this benchmark is to compare recipes and track performance improvements, not to achieve the best possible latency; perf results for a single configuration now take ~30 s to ~1 min, and an overall run on one B200 takes ~20 min
* fix the quant recipe to exclude one more layer with small shapes
* add the results to the documentation

I want to land this so we can see the e2e metrics lift from eventually landing #4031.

Test Plan:

```
./benchmarks/quantization/eval_accuracy_and_perf_of_flux.sh
```

ghstack-source-id: 8fb74eb
ghstack-comment-id: 4055040194
Pull-Request: #4072
1 parent 3c2cb8c commit 0a229a9
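
The `reduce-overhead` change described in the Summary compiles each transformer block separately and clones CUDA tensor outputs so that CUDA-graph-backed buffers are not aliased by later operations. Below is a minimal sketch of that pattern; it mirrors the `clone_output_wrapper` / `apply_torch_compile` helpers added in the Python diff further down, and it assumes a diffusers Flux pipeline named `pipe` is already loaded (not shown here).

```python
# Sketch only: per-block compile with CUDA-graph-friendly output cloning,
# mirroring the helpers added in this commit. `pipe` is assumed to be a
# loaded diffusers FluxPipeline.
from functools import wraps

import torch
from torch.utils._pytree import tree_map_only


def clone_cuda_outputs(f):
    """Wrap f so any CUDA tensor outputs are cloned out of reused buffers."""

    @wraps(f)
    def wrapped(*args, **kwargs):
        outputs = f(*args, **kwargs)
        return tree_map_only(
            torch.Tensor, lambda t: t.clone() if t.is_cuda else t, outputs
        )

    return wrapped


def compile_transformer_blocks(pipe, mode: str = "reduce-overhead"):
    """Compile each block's forward individually instead of the whole transformer."""
    blocks = list(pipe.transformer.transformer_blocks) + list(
        pipe.transformer.single_transformer_blocks
    )
    for block in blocks:
        block.forward = clone_cuda_outputs(torch.compile(block.forward, mode=mode))
```

Compiling per block keeps compile times manageable while still letting CUDA graphs capture each block, which is what the Summary credits for the nvfp4 batch-size-1 improvement.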

3 files changed

Lines changed: 137 additions & 30 deletions


benchmarks/quantization/eval_accuracy_and_perf_of_flux.py

Lines changed: 100 additions & 23 deletions
@@ -8,6 +8,8 @@
 import os
 import random
 import time
+from functools import wraps
+from typing import Callable, TypeVar
 
 import diffusers
 import fire
@@ -25,6 +27,9 @@
     quantize_,
 )
 
+# Type variables for better type hinting
+T = TypeVar("T")
+
 # -----------------------------
 # Config
 # -----------------------------
@@ -71,12 +76,18 @@ def print_pipeline_architecture(pipe):
 
 
 def generate_image(
-    pipe, prompt: str, seed: int, device: str, num_inference_steps
+    pipe,
+    prompt: str,
+    seed: int,
+    device: str,
+    num_inference_steps: int,
+    batch_size: int = 1,
 ) -> Image.Image:
     generator = torch.Generator(device=device).manual_seed(seed)
 
+    prompts = [prompt] * batch_size
     image = pipe(
-        prompt=prompt,
+        prompt=prompts,
         num_inference_steps=num_inference_steps,  # can tweak for speed vs quality
         guidance_scale=7.5,
         generator=generator,
@@ -238,18 +249,58 @@ def pil_to_lpips_tensor(img: Image.Image, device: str):
     return t.to(device)
 
 
+from torch.utils._pytree import tree_map_only
+
+
+def clone_output_wrapper(f: Callable[..., T]) -> Callable[..., T]:
+    """
+    Clone the CUDA output tensors of a function to avoid in-place operations.
+
+    This wrapper is useful when working with torch.compile to prevent errors
+    related to in-place operations on tensors.
+
+    Args:
+        f: The function whose CUDA tensor outputs should be cloned
+
+    Returns:
+        A wrapped function that clones any CUDA tensor outputs
+    """
+
+    @wraps(f)
+    def wrapped(*args, **kwargs):
+        outputs = f(*args, **kwargs)
+        return tree_map_only(
+            torch.Tensor, lambda t: t.clone() if t.is_cuda else t, outputs
+        )
+
+    return wrapped
+
+
+def apply_torch_compile(pipe, torch_compile_mode: str = "default"):
+    """Apply torch.compile to the transformer blocks in-place."""
+    for block in pipe.transformer.transformer_blocks:
+        block.forward = clone_output_wrapper(
+            torch.compile(block.forward, mode=torch_compile_mode)
+        )
+    for block in pipe.transformer.single_transformer_blocks:
+        block.forward = clone_output_wrapper(
+            torch.compile(block.forward, mode=torch_compile_mode)
+        )
+
+
 @torch.inference_mode()
 def run(
     mode: str = "accuracy",
     num_prompts: int = None,
-    num_inference_steps: int = 20,
+    num_inference_steps: int = 4,
     quant_config_str: str = "float8_rowwise",
     use_compile: bool = False,
     torch_compile_mode: str = "default",
     debug_prompt: str | None = None,
     print_model: bool = False,
     cache_baseline_images: bool = False,
     perf_n_iter: int = 10,
+    batch_size: int = 1,
     use_deterministic_algorithms: bool = False,
     num_gpus_used: int = None,
 ):
@@ -282,6 +333,7 @@ def run(
             instead of regenerated, if available. This is useful to make eval runs faster
             if we know the baseline is not changing.
         perf_n_iter: number of measurements to take for measuring performance
+        batch_size: batch size for performance_hp and performance_quant modes (default 1)
         use_deterministic_algorithms: if True, sets torch.use_deterministic_algorithms(True)
         num_gpus_used: For 'aggregate_accuracy' mode, the number of GPUs that were used
             to generate the data. Required for aggregate_accuracy mode.
@@ -314,6 +366,7 @@ def run(
     print(f"[Rank {local_rank}/{world_size}] use_compile: {use_compile}")
     print(f"[Rank {local_rank}/{world_size}] torch_compile_mode: {torch_compile_mode}")
     print(f"[Rank {local_rank}/{world_size}] {use_deterministic_algorithms=}")
+    print(f"[Rank {local_rank}/{world_size}] {batch_size=}")
    print(f"[Rank {local_rank}/{world_size}] {cache_baseline_images=}")
 
     assert mode in (
@@ -322,6 +375,11 @@ def run(
         "performance_quant",
         "aggregate_accuracy",
     )
+    assert batch_size >= 1, f"batch_size must be >= 1, got {batch_size}"
+    if mode in ("accuracy", "aggregate_accuracy"):
+        assert batch_size == 1, (
+            f"batch_size must be 1 for {mode} mode, got {batch_size}"
+        )
 
     # Handle aggregate_accuracy mode separately
     if mode == "aggregate_accuracy":
@@ -438,14 +496,6 @@ def run(
 
     loss_fn = lpips.LPIPS(net="vgg").to(device)
 
-    # Store original for restoration later, since we will quantize it
-    # and compile the quantized version again
-    orig_transformer = pipe.transformer
-
-    if use_compile:
-        pipe.transformer = torch.compile(orig_transformer, mode=torch_compile_mode)
-        pipe.vae.decode = torch.compile(pipe.vae.decode, mode=torch_compile_mode)
-
     # -----------------------------
     # 2. Baseline images (for all prompts)
     # -----------------------------
@@ -473,6 +523,8 @@ def run(
     baseline_times = []
 
     if mode == "accuracy":
+        # note: never compile for baseline images
+
         for local_idx, prompt in enumerate(my_prompts):
             # Calculate global prompt index
             global_idx = local_rank + local_idx * world_size
@@ -500,32 +552,39 @@ def run(
             baseline_times.append(t1 - t0)
 
     elif mode == "performance_hp":
+        if use_compile:
+            apply_torch_compile(pipe, torch_compile_mode)
+
         # High precision performance mode - measure baseline without quantization
         if local_rank == 0:
             # warm up compile
             _ = generate_image(
-                pipe, prompts_to_use[0], RANDOM_SEED, device, num_inference_steps
+                pipe,
+                prompts_to_use[0],
+                RANDOM_SEED,
+                device,
+                num_inference_steps,
+                batch_size=batch_size,
             )
 
             for _ in range(perf_n_iter):
                 t0 = time.time()
                 _ = generate_image(
-                    pipe, prompts_to_use[0], RANDOM_SEED, device, num_inference_steps
+                    pipe,
+                    prompts_to_use[0],
+                    RANDOM_SEED,
+                    device,
+                    num_inference_steps,
+                    batch_size=batch_size,
                 )
                 t1 = time.time()
                 baseline_times.append(t1 - t0)
 
-    if use_compile and mode in ("accuracy", "performance_quant"):
-        print(
-            f"[Rank {local_rank}/{world_size}] Restoring original (uncompiled) transformer before quantization"
-        )
-        pipe.transformer = orig_transformer
-
     # Only quantize for accuracy and performance_quant modes
     if mode in ("accuracy", "performance_quant"):
         # Inspect Linear layers in main component
         component_linear_fqns_and_weight_shapes = []
-        for fqn, module in orig_transformer.named_modules():
+        for fqn, module in pipe.transformer.named_modules():
            if isinstance(module, torch.nn.Linear):
                weight_shape = module.weight.shape
                if print_model:
@@ -545,15 +604,21 @@ def run(
                    continue
                elif fqn == "proj_out":
                    continue
+                elif "norm.linear" in fqn:
+                    # activations here have shape [batch_size, 3072], so
+                    # too small to see speedups from activation quantization
+                    continue
                elif weight_shape[0] < 1024 or weight_shape[1] < 1024:
                    continue
                fqn_to_config_dict[fqn] = config_obj
        fqn_to_config = FqnToConfig(fqn_to_config=fqn_to_config_dict)
 
        # Quantize the main component using this config
        quantize_(pipe.transformer, fqn_to_config, filter_fn=None)
+
        if use_compile:
-            pipe.transformer = torch.compile(pipe.transformer, mode=torch_compile_mode)
+            apply_torch_compile(pipe, torch_compile_mode)
+
        if print_model:
            print_pipeline_architecture(pipe)
 
@@ -615,13 +680,23 @@ def run(
        if local_rank == 0:
            # warm up compile
            _ = generate_image(
-                pipe, prompts_to_use[0], RANDOM_SEED, device, num_inference_steps
+                pipe,
+                prompts_to_use[0],
+                RANDOM_SEED,
+                device,
+                num_inference_steps,
+                batch_size=batch_size,
            )
 
            for _ in range(perf_n_iter):
                t0 = time.time()
                _ = generate_image(
-                    pipe, prompts_to_use[0], RANDOM_SEED, device, num_inference_steps
+                    pipe,
+                    prompts_to_use[0],
+                    RANDOM_SEED,
+                    device,
+                    num_inference_steps,
+                    batch_size=batch_size,
                )
                t1 = time.time()
                times.append(t1 - t0)
@@ -691,11 +766,13 @@ def run(
            writer.writerow(["average_quantized_time", f"{avg_quant_time:.4f}"])
        elif mode == "performance_hp":
            writer.writerow(["perf_n_iter", perf_n_iter])
+            writer.writerow(["batch_size", batch_size])
            writer.writerow(["average_time", f"{avg_time:.4f}"])
            for idx, val in enumerate(baseline_times):
                writer.writerow([f"time_{idx}", f"{val:.4f}"])
        elif mode == "performance_quant":
            writer.writerow(["perf_n_iter", perf_n_iter])
+            writer.writerow(["batch_size", batch_size])
            writer.writerow(["average_time", f"{avg_time:.4f}"])
            for idx, val in enumerate(times):
                writer.writerow([f"time_{idx}", f"{val:.4f}"])
benchmarks/quantization/eval_accuracy_and_perf_of_flux.sh

Lines changed: 9 additions & 7 deletions
@@ -1,25 +1,27 @@
 #!/bin/bash
 
 # number of local GPUs to use for accuracy eval
-NUM_GPUS=8
+NUM_GPUS=1
 
 # float8 rowwise
 # note: max-autotune performance is nearly identical to regular compile on b200, so skip it for now
 time torchrun --nproc_per_node=$NUM_GPUS benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode accuracy --use_deterministic_algorithms
 time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode aggregate_accuracy --num_gpus_used $NUM_GPUS
-time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_hp --use_compile
-time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_quant --use_compile
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_hp --use_compile --torch_compile_mode reduce-overhead --batch_size 1
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_hp --use_compile --torch_compile_mode reduce-overhead --batch_size 4
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 1
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str float8_rowwise --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 4
 
 # mxfp8
 time torchrun --nproc_per_node=$NUM_GPUS benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode accuracy --cache_baseline_images --use_deterministic_algorithms
 time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode aggregate_accuracy --num_gpus_used $NUM_GPUS
-# time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode performance_hp --use_compile
-time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode performance_quant --use_compile
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 1
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str mxfp8 --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 4
 
 # nvfp4
 # note: even though we are using a triton kernel for to_nvfp4 cast, we still need
 # to enable compile for fast generation of the nvfp4 global scale
 time torchrun --nproc_per_node=$NUM_GPUS benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode accuracy --cache_baseline_images --use_deterministic_algorithms
 time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode aggregate_accuracy --num_gpus_used $NUM_GPUS
-# time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode performance_hp --use_compile
-time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode performance_quant --use_compile
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 1
+time python -u benchmarks/quantization/eval_accuracy_and_perf_of_flux.py --quant_config_str nvfp4 --mode performance_quant --use_compile --torch_compile_mode reduce-overhead --batch_size 4

docs/source/workflows/inference.md

Lines changed: 28 additions & 0 deletions
@@ -178,6 +178,34 @@
 4 16384 16384 16384 3.82 2.31
 ```
 
+## e2e flux-1.schnell benchmarks
+
+These benchmarks compare accuracy and performance of torchao inference quantization on the
+[flux-1.schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell) model.
+
+For accuracy, we measure the [LPIPS](https://github.com/richzhang/PerceptualSimilarity) score
+between images generated by the quantized model and the high precision (bfloat16) baseline,
+averaged over the prompts from the [sayakpaul/drawbench](https://huggingface.co/datasets/sayakpaul/drawbench) dataset —
+lower is better, with 0 meaning identical.
+
+Note that this benchmark optimizes for speed of iteration and does not represent
+the best possible metrics someone could achieve on this model. Instead, this is an
+apples-to-apples comparison intended to compare different quantization recipes at a
+high level, and measure performance improvements.
+
+| experiment | lpips_avg | time_s_bsz_1 | speedup_bsz_1 | time_s_bsz_4 | speedup_bsz_4 |
+| ---------- | --------- | ------------- | -------------- | ------------- | -------------- |
+| bfloat16 | 0 | 0.4178 | 1.00 | 1.4914 | 1.00 |
+| float8_rowwise | 0.1236 | 0.3455 | 1.21 | 1.1986 | 1.24 |
+| mxfp8 | 0.1260 | 0.3673 | 1.14 | 1.2820 | 1.16 |
+| nvfp4 | 0.2694 | 0.3308 | 1.26 | 1.1334 | 1.32 |
+
+To reproduce, run:
+
+```bash
+./benchmarks/quantization/eval_accuracy_and_perf_of_flux.sh
+```
+
 ## Other Available Quantization Techniques
 
 ### Int8DynamicActivationIntxWeightConfig Quantization
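
The LPIPS numbers in the new docs table come from comparing quantized and bfloat16 images with the `lpips` package (the benchmark script constructs `lpips.LPIPS(net="vgg")`). Below is a minimal sketch of that comparison for a single pair of images; the image file names are placeholders, and the preprocessing is a simplified stand-in for the script's own `pil_to_lpips_tensor` helper.

```python
# Sketch only: LPIPS distance between a baseline image and a quantized-model image.
# Lower is better; 0 means identical. File names are placeholders.
import lpips
import torch
from PIL import Image
from torchvision.transforms.functional import to_tensor

loss_fn = lpips.LPIPS(net="vgg")


def to_lpips_tensor(img: Image.Image) -> torch.Tensor:
    # LPIPS expects float tensors of shape [N, 3, H, W] scaled to [-1, 1]
    return to_tensor(img).unsqueeze(0) * 2.0 - 1.0


baseline = to_lpips_tensor(Image.open("baseline.png").convert("RGB"))
quantized = to_lpips_tensor(Image.open("quantized.png").convert("RGB"))

with torch.no_grad():
    print(f"LPIPS: {loss_fn(baseline, quantized).item():.4f}")
```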
