go even simpler

lucidrains · lucidrains · commit d4ad91881e48 · 2024-09-06T10:27:50.000-07:00
diff --git a/README.md b/README.md
@@ -17,83 +17,24 @@ $ pip install transfusion-pytorch
 One modality, say images
 
 ```python
-import torch
+from torch import randint, randn
 from transfusion_pytorch import Transfusion
 
 model = Transfusion(
     num_text_tokens = 256,
-    dim_latent = 192,
+    dim_latent = 384,
     transformer = dict(
         dim = 512,
         depth = 8
     )
 )
 
-text_ids = torch.randint(0, 256, (2, 1024))
+text_and_images = [
+    [randint(0, 256, (16,)), randn(4, 384), randint(0, 256, (8,)), randn(6, 384)],
+    [randint(0, 256, (16,)), randn(7, 384), randint(0, 256, (5,)), randn(2, 384), randint(0, 256, (9,))]
+]
 
-modality_tokens = [[
-    torch.randn(6, 192),
-    torch.randn(4, 192)
-], [
-    torch.randn(5, 192),
-    torch.randn(3, 192)
-]]
-
-modality_positions = [[
-    (2, 6),
-    (10, 4)
-], [
-    (2, 5),
-    (10, 3)
-]] # (offset, length)
-
-loss, breakdown = model(
-    text_ids,
-    modality_tokens = modality_tokens,
-    modality_positions = modality_positions
-)
-
-loss.backward()
-```
-
-Multiple modalities
-
-```python
-import torch
-from transfusion_pytorch import Transfusion
-
-model = Transfusion(
-    num_text_tokens = 256,
-    dim_latent = (384, 192),
-    transformer = dict(
-        dim = 512,
-        depth = 8
-    )
-)
-
-text_ids = torch.randint(0, 256, (2, 1024))
-
-modality_tokens = [[
-    torch.randn(6, 384),
-    torch.randn(4, 192)
-], [
-    torch.randn(5, 192),
-    torch.randn(3, 384)
-]]
-
-modality_positions = [[
-    (0, 2, 6),
-    (1, 10, 4)
-], [
-    (1, 2, 5),
-    (0, 10, 3)
-]] # (type, offset, length)
-
-loss, breakdown = model(
-    text_ids,
-    modality_tokens = modality_tokens,
-    modality_positions = modality_positions
-)
+loss = model(text_and_images)
 
 loss.backward()
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "transfusion-pytorch"
-version = "0.0.14"
+version = "0.0.15"
 description = "Transfusion in Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/transfusion_pytorch/transfusion.py b/transfusion_pytorch/transfusion.py
@@ -585,17 +585,54 @@ def __init__(
         self.ignore_index = ignore_index
         self.diffusion_loss_weight = diffusion_loss_weight
 
+    @property
+    def device(self): # device of the first parameter yielded by self.parameters()
+        return next(self.parameters()).device
+
     def forward(
         self,
-        text: Int['b n'],
-        modality_tokens: list[list[Float['_ _']]] | list[Float['b n _']] | Float['b n _'],
-        modality_positions: RawModalityPositions | Int['b m 2'] | Int['b m 3'],
+        modalities: list[list[Int['_'] | Float['_ _']]],
         times: Float['b m'] | None = None,
-        return_loss = True
+        return_loss = True,
+        return_breakdown = False
     ) -> (
         Float['b n l'] |
+        Float[''] |
         tuple[Float[''], LossBreakdown]
     ):
+        device = self.device
+
+        # process list of text and modalities interspersed with one another
+
+        modality_positions = []
+        modality_tokens = []
+        text = []
+
+        for batch_modalities in modalities:
+            batch_modality_positions = []
+            batch_modality_tokens = []
+            batch_text = []
+            offset = 0 # running position within this sample - must persist across segments so each modality gets its true offset
+
+            for modality in batch_modalities:
+                is_text = modality.dtype in (torch.long, torch.int)
+
+                length = modality.shape[0]
+
+                if is_text:
+                    batch_text.append(modality)
+                else:
+                    batch_text.append(torch.full((length,), -1, device = device)) # -1 placeholder ids where modality tokens sit, zeroed before text embedding
+                    batch_modality_tokens.append(modality)
+                    batch_modality_positions.append((offset, length)) # (offset, length) of this modality within the sample
+
+                offset += length
+
+            text.append(torch.cat(batch_text))
+            modality_tokens.append(batch_modality_tokens)
+            modality_positions.append(batch_modality_positions)
+
+        text = pad_sequence(text, padding_value = -1)
 
         # if returning loss, split text for next token prediction
 
@@ -653,6 +690,8 @@ def forward(
 
         # embed text
 
+        text = text.masked_fill(text == -1, 0)
+
         text_tokens = self.text_embed(text)
 
         # noise the modality tokens
@@ -766,4 +805,7 @@ def forward(
             (torch.stack(diffusion_losses) * torch.stack(modality_loss_weights)).sum() * self.diffusion_loss_weight
         )
 
+        if not return_breakdown:
+            return total_loss
+
         return total_loss, LossBreakdown(total_loss, text_loss, diffusion_losses)