@@ -624,6 +624,30 @@ def apply_fn_modality_type(
624624
625625 return tree_unflatten (out , tree_spec )
626626
627+ # decorator for model output to flow
628+
def get_model_output_to_flow_fn(
    noised: Tensor,
    times: Tensor,
    eps = 5e-2,
    return_decorator = False
):
    """Build a converter from a clean-data prediction to a flow target.

    Given the noised input and its flow times, returns ``to_flow(out)``
    which maps a model's predicted clean sample ``out`` to the implied
    flow ``(out - noised) / (1 - t)``, with the denominator clamped to
    ``eps`` to avoid blow-up as ``t`` approaches 1.

    If ``return_decorator`` is True, instead returns a decorator that
    wraps a ``fn(embed) -> out`` so its output is converted to a flow.
    """

    def to_flow(out):
        # right-pad times with singleton dims so it broadcasts over out
        t = append_dims(times, out.ndim - 1)
        denom = (1. - t).clamp_min(eps)
        return (out - noised) / denom

    if return_decorator:
        def decorator(fn):
            def wrapped(embed):
                return to_flow(fn(embed))
            return wrapped
        return decorator

    return to_flow
650+
627651# sampling related functions
628652
629653# min_p for text
@@ -1230,6 +1254,7 @@ def __init__(
12301254 * ,
12311255 num_text_tokens ,
12321256 transformer : dict | Transformer ,
1257+ pred_clean = False ,
12331258 dim_latent : int | tuple [int , ...] | None = None ,
12341259 channel_first_latent : bool | tuple [bool , ...] = False ,
12351260 add_pos_emb : bool | tuple [bool , ...] = False ,
@@ -1251,6 +1276,7 @@ def __init__(
12511276 rtol = 1e-5 ,
12521277 method = 'midpoint'
12531278 ),
1279+ eps = 5e-2
12541280 ):
12551281 super ().__init__ ()
12561282
@@ -1453,6 +1479,11 @@ def __init__(
14531479 self .has_recon_loss = reconstruction_loss_weight > 0.
14541480 self .reconstruction_loss_weight = reconstruction_loss_weight
14551481
1482+ # whether model is predicting clean
1483+
1484+ self .pred_clean = pred_clean
1485+ self .eps = eps
1486+
14561487 # flow sampling related
14571488
14581489 self .odeint_fn = partial (odeint , ** odeint_kwargs )
@@ -2001,6 +2032,13 @@ def forward_modality(
20012032 else :
20022033 noised_tokens = tokens
20032034
2035+ # save the noised and times
2036+
2037+ model_output_to_flow = identity
2038+
2039+ if self .pred_clean :
2040+ model_output_to_flow = get_model_output_to_flow_fn (noised_tokens , times )
2041+
20042042 # from latent to model tokens
20052043
20062044 noised_tokens = mod .latent_to_model (noised_tokens )
@@ -2034,7 +2072,9 @@ def forward_modality(
20342072
20352073 embed = inverse_pack_axial_dims (embed )
20362074
2037- pred_flow = mod .model_to_latent (embed )
2075+ model_output = mod .model_to_latent (embed )
2076+
2077+ pred_flow = model_output_to_flow (model_output )
20382078
20392079 if not return_loss :
20402080 return pred_flow
@@ -2475,6 +2515,14 @@ def inner(pred_flow):
24752515
24762516 inverse_fn = model_to_pred_flow (batch_index , offset + precede_modality_tokens , modality_length , unpack_modality_shape )
24772517
2518+ # maybe decorate the function if model output is predicting clean
2519+
2520+ if self .pred_clean :
2521+ decorator = get_model_output_to_flow_fn (modality_tensor , modality_time , self .eps , return_decorator = True )
2522+ inverse_fn = decorator (inverse_fn )
2523+
2524+ # store function for extracting flow later
2525+
24782526 get_pred_flows [modality_type ].append (inverse_fn )
24792527
24802528 # increment offset
0 commit comments