@@ -1424,6 +1424,8 @@ def forward_modality(
         return_loss = True
     ) -> Float['']:
 
+        shape = modalities.shape
+
         if self.num_modalities > 1:
             assert exists(modality_type), '`modality_type` must be explicitly passed in on forward when training on greater than 1 modality'
@@ -1433,12 +1435,32 @@ def forward_modality(
         latent_to_model_fn = self.latent_to_model_projs[modality_type]
         model_to_flow_pred_fn = self.model_to_latent_preds[modality_type]
 
+        # grab the shape of the modality, for maybe axial pos emb
+
+        add_pos_emb = self.add_pos_emb[modality_type]
+        maybe_pos_emb_mlp = self.pos_emb_mlp[modality_type]
+        modality_num_dim = self.modality_num_dim[modality_type]
+
+        if add_pos_emb:
+            assert exists(modality_num_dim), f'modality_num_dim must be set for modality {modality_type} if further injecting axial positional embedding'
+
+            if self.channel_first_latent:
+                _, _, *axial_dims = shape
+            else:
+                _, *axial_dims, _ = shape
+
+            assert len(axial_dims) == modality_num_dim, f'received modalities of ndim {len(axial_dims)} but expected {modality_num_dim}'
+
+        # maybe transform
+
         tokens = transform(modalities)
 
         # maybe channel first
 
         if self.channel_first_latent:
-            tokens = rearrange(tokens, 'b d ... -> b (...) d')
+            tokens = rearrange(tokens, 'b d ... -> b ... d')
+
+        tokens = rearrange(tokens, 'b ... d -> b (...) d')
 
         # rotary
 
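As a quick illustration of what the changed rearranges above do (a standalone sketch, not code from the repository, with made-up tensor sizes): a channel-first latent is first moved to channel-last, its spatial dims are recorded for the axial positional embedding, and only then is it flattened into a token sequence.

```python
import torch
from einops import rearrange

tokens = torch.randn(2, 512, 8, 8)                  # (b, d, h, w) - channel-first latent

tokens = rearrange(tokens, 'b d ... -> b ... d')    # -> (2, 8, 8, 512), channel-last
axial_dims = tokens.shape[1:-1]                      # (8, 8) - what the axial pos emb needs

tokens = rearrange(tokens, 'b ... d -> b (...) d')   # -> (2, 64, 512), flattened to a sequence

assert tokens.shape == (2, 64, 512)
assert tuple(axial_dims) == (8, 8)
```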
@@ -1459,10 +1481,17 @@ def forward_modality(
 
         flow = tokens - noise
 
-        # attention
-
         noised_tokens = latent_to_model_fn(noised_tokens)
 
+        # maybe add axial pos emb
+
+        if add_pos_emb:
+            axial_pos_emb = maybe_pos_emb_mlp(tensor(axial_dims))
+
+            noised_tokens = noised_tokens + rearrange(axial_pos_emb, '... d -> (...) d')
+
+        # attention
+
         embed = self.transformer(
             noised_tokens,
             times = times,
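For context on the shape contract of the new axial positional embedding path: `maybe_pos_emb_mlp` is expected to map the axial dims to an embedding grid of shape `(*axial_dims, dim)`, which is then flattened and broadcast-added onto the `(batch, seq, dim)` noised tokens. The module below is a hypothetical stand-in, not the repository's implementation, written only to make those shapes concrete.

```python
import torch
from torch import nn, tensor
from einops import rearrange

class AxialPositionMLP(nn.Module):
    # hypothetical: maps normalized grid coordinates to per-position embeddings
    def __init__(self, dim, num_axial_dims = 2):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(num_axial_dims, dim),
            nn.SiLU(),
            nn.Linear(dim, dim)
        )

    def forward(self, axial_dims):
        # build a grid of normalized coordinates, one coordinate vector per position
        coords = torch.stack(torch.meshgrid(
            *[torch.linspace(0., 1., int(d)) for d in axial_dims],
            indexing = 'ij'
        ), dim = -1)                                  # (h, w, num_axial_dims)

        return self.mlp(coords)                       # (h, w, dim)

pos_emb_mlp = AxialPositionMLP(dim = 512)

noised_tokens = torch.randn(2, 64, 512)               # (b, h * w, d)
axial_pos_emb = pos_emb_mlp(tensor((8, 8)))            # (8, 8, 512)

noised_tokens = noised_tokens + rearrange(axial_pos_emb, '... d -> (...) d')  # (2, 64, 512)
```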