@@ -90,6 +90,7 @@ class LossBreakdown(NamedTuple):
     text: Scalar
     flow: list[Scalar]
     velocity: list[Scalar] | None
+    recon: list[Scalar] | None
 
 class ModalityInfo(NamedTuple):
     encoder: Module | None
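
The new `recon` field mirrors `flow` and `velocity`: a list of scalar losses (presumably one per modality), or `None` when the reconstruction loss is disabled. A minimal sketch of reading it back; how a `LossBreakdown` instance is obtained from the model is an assumption here, not part of this diff:

# hypothetical usage -- assumes the model can return a LossBreakdown when asked
loss, breakdown = model(batch, return_loss_breakdown = True)

if breakdown.recon is not None:   # None when reconstruction_loss_weight == 0.
    print(breakdown.recon)        # list of scalar reconstruction losses
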
@@ -1085,6 +1086,7 @@ def __init__(
         flow_loss_weight = 1.,
         text_loss_weight = 1.,
         velocity_consistency_loss_weight = 0.1,
+        reconstruction_loss_weight = 0.,
         modality_encoder_decoder_requires_batch_dim = True, # whether the modality encoder / decoder requires batch dimension, will auto assume it is needed
         odeint_kwargs: dict = dict(
             atol = 1e-5,
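
Since `reconstruction_loss_weight` defaults to `0.`, existing configurations are unaffected; it must be set above zero to activate the loss. A construction sketch under that assumption, where `Model` stands in for the class owning this `__init__` and the encoder / decoder keyword names are illustrative rather than taken from this diff:

from torch import nn

# hypothetical construction -- keyword names other than the two visible in this
# hunk are assumptions for illustration
model = Model(
    modality_encoder = nn.Identity(),   # the recon loss asserts the modality was encoded
    modality_decoder = nn.Identity(),   # and decodes the prediction back when a decoder exists
    reconstruction_loss_weight = 0.1,   # > 0. turns the auxiliary loss on (default stays 0.)
    modality_encoder_decoder_requires_batch_dim = True,
)
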
@@ -1277,6 +1279,11 @@ def __init__(
 
         self.velocity_consistency_loss_weight = velocity_consistency_loss_weight
 
+        # additional reconstruction loss, through the decoder
+
+        self.has_recon_loss = reconstruction_loss_weight > 0.
+        self.reconstruction_loss_weight = reconstruction_loss_weight
+
         # flow sampling related
 
         self.odeint_fn = partial(odeint, **odeint_kwargs)
@@ -1711,10 +1718,10 @@ def forward_modality(
         return_loss = True,
         return_loss_breakdown = False
     ) -> Scalar | Float['b ...']:
-
         requires_velocity_consistency = exists(velocity_consistency_ema_model)
 
         modalities = modalities.to(self.device)
+        orig_modalities = modalities
 
         if self.num_modalities > 1:
             assert exists(modality_type), '`modality_type` must be explicitly passed in on forward when training on greater than 1 modality'
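
Stashing `orig_modalities` before the modality encoder runs matters for the new loss: the decoded reconstruction at the end of `forward_modality` is compared against this raw, pre-encoding input rather than against the encoded latents.
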
@@ -1754,6 +1761,7 @@ def forward_modality(
             noised_tokens = padded_times * tokens + (1. - padded_times) * noise
 
             flow = tokens - noise
+
         else:
             noised_tokens = tokens
 
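
The lines above are the linear interpolation used for flow matching; a quick numeric check of the identities the reconstruction step below leans on (illustrative only, independent of the model):

import torch

# noised = t * x1 + (1 - t) * x0  with  flow = x1 - x0,
# so the clean sample is recoverable as  x0 + flow  or as  noised + (1 - t) * flow
x1, x0, t = torch.randn(8), torch.randn(8), torch.rand(())

noised = t * x1 + (1 - t) * x0
flow = x1 - x0

assert torch.allclose(x0 + flow, x1, atol = 1e-6)
assert torch.allclose(noised + (1 - t) * flow, x1, atol = 1e-6)
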
@@ -1816,17 +1824,37 @@ def forward_modality(
 
             velocity_loss = F.mse_loss(flow, flow_with_delta_time)
 
+        # maybe recon loss
+
+        recon_loss = self.zero
+
+        if self.has_recon_loss:
+            assert encode_modality
+
+            recon = noise + pred_flow * (1. - padded_times)
+
+            if exists(mod.decoder):
+                with torch.no_grad():
+                    mod.decoder.eval()
+                    recon = self.maybe_add_temp_batch_dim(mod.decoder)(recon)
+
+            recon_loss = F.mse_loss(
+                recon,
+                orig_modalities
+            )
+
         # total loss
 
         total_loss = (
             flow_loss +
-            velocity_loss * self.velocity_consistency_loss_weight
+            velocity_loss * self.velocity_consistency_loss_weight +
+            recon_loss * self.reconstruction_loss_weight
         )
 
         if not return_loss_breakdown:
             return total_loss
 
-        return total_loss, (flow_loss, velocity_loss)
+        return total_loss, (flow_loss, velocity_loss, recon_loss)
 
     @torch.no_grad()
     @eval_decorator
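
Finally, a sketch of consuming the widened breakdown from `forward_modality`; calling it directly, the tensor shape, and the modality type are placeholders, not prescribed by this diff:

import torch

# hypothetical call -- only return_loss_breakdown and the three-part breakdown
# are grounded in the diff above
total_loss, (flow_loss, velocity_loss, recon_loss) = model.forward_modality(
    torch.randn(1, 4, 384),
    modality_type = 0,
    return_loss_breakdown = True
)

total_loss.backward()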