Skip to content

Commit 05ac097

Browse files
committed
support multiple modalities again, except simpler
1 parent d4ad918 commit 05ac097

File tree

3 files changed

+48
-7
lines changed

3 files changed

+48
-7
lines changed

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,33 @@ loss = model(text_and_images)
3939
loss.backward()
4040
```
4141

42+
Multiple different modalities
43+
44+
```python
45+
from torch import randint, randn
46+
from transfusion_pytorch import Transfusion
47+
48+
model = Transfusion(
49+
num_text_tokens = 256,
50+
dim_latent = (384, 192), # specify multiple latent dimensions
51+
transformer = dict(
52+
dim = 512,
53+
depth = 8
54+
)
55+
)
56+
57+
# then for the Tensors of type float, you can pass a tuple[int, Tensor] and specify the modality index in the first position
58+
59+
text_images_and_audio = [
60+
[randint(0, 256, (16,)), (0, randn(4, 384)), randint(0, 256, (8,)), (1, randn(6, 192))],
61+
[randint(0, 256, (16,)), randn(7, 384), randint(0, 256, (5,)), (1, randn(2, 192)), randint(0, 256, (9,))]
62+
]
63+
64+
loss = model(text_images_and_audio)
65+
66+
loss.backward()
67+
```
68+
4269
## Citations
4370

4471
```bibtex

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "transfusion-pytorch"
3-
version = "0.0.15"
3+
version = "0.0.16"
44
description = "Transfusion in Pytorch"
55
authors = [
66
{ name = "Phil Wang", email = "lucidrains@gmail.com" }

transfusion_pytorch/transfusion.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ def device(self):
591591

592592
def forward(
593593
self,
594-
modalities: list[list[Int['_'] | Float['_ _']]],
594+
modalities: list[list[Int['_'] | Float['_ _'] | tuple[int, Float['_ _']]]],
595595
times: Float['b m'] | None = None,
596596
return_loss = True,
597597
return_breakdown = False
@@ -614,17 +614,31 @@ def forward(
614614
batch_text = []
615615

616616
for modality in batch_modalities:
617-
is_text = modality.dtype in (torch.long, torch.int)
617+
# if non-text modality detected and not given as a tuple
618+
# cast to (int, Tensor) where int is defaulted to type 0 (convenience for one modality)
619+
620+
if torch.is_tensor(modality) and modality.dtype == torch.float:
621+
modality = (0, modality)
622+
623+
is_text = not isinstance(modality, tuple)
624+
625+
if is_text:
626+
modality_tensor = modality
627+
else:
628+
modality_type, modality_tensor = modality
629+
630+
assert 0 <= modality_type < self.num_modalities, f'received a modality index that is out of range. only {self.num_modalities} modalities specified'
631+
assert self.dim_latents[modality_type] == modality_tensor.shape[-1], f'mismatch for modality latent dimension - expected {self.dim_latents[modality_type]} but received {modality_tensor.shape[-1]}'
618632

619-
length = modality.shape[0]
620633
offset = 0
634+
length = modality_tensor.shape[0]
621635

622636
if is_text:
623-
batch_text.append(modality)
637+
batch_text.append(modality_tensor)
624638
else:
625639
batch_text.append(torch.full((length,), -1, device = device))
626-
batch_modality_tokens.append(modality)
627-
batch_modality_positions.append((offset, length))
640+
batch_modality_tokens.append(modality_tensor)
641+
batch_modality_positions.append((modality_type, offset, length))
628642

629643
offset += length
630644

0 commit comments

Comments
 (0)