Skip to content

Commit 372e3d2

Browse files
committed
muon is here to stay
1 parent ea04570 commit 372e3d2

File tree

4 files changed

+50
-20
lines changed

4 files changed

+50
-20
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "transfusion-pytorch"
3-
version = "0.11.0"
3+
version = "0.12.0"
44
description = "Transfusion in Pytorch"
55
authors = [
66
{ name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -49,6 +49,7 @@ build-backend = "hatchling.build"
4949
[project.optional-dependencies]
5050

5151
examples = [
52+
"adam-atan2-pytorch>=0.2.2",
5253
"datasets",
5354
"diffusers"
5455
]

train_image_only.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from torch import tensor
66
from torch.nn import Module
77
from torch.utils.data import Dataset, DataLoader
8-
from torch.optim import Adam
8+
9+
from adam_atan2_pytorch import MuonAdamAtan2
910

1011
from einops import rearrange
1112

@@ -83,7 +84,7 @@ def cycle(iter_dl):
8384
dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)
8485
iter_dl = cycle(dataloader)
8586

86-
optimizer = Adam(model.parameters(), lr = 8e-4)
87+
optimizer = MuonAdamAtan2(model.muon_parameters(), model.parameters(), lr = 8e-4)
8788

8889
# train loop
8990

train_mnist.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
IMAGE_AFTER_TEXT = True # False for captioning, True for text-to-image
2525
USE_PROMPT = False # whether to use prompting, or synthesize from start token
2626
NUM_TRAIN_STEPS = 20_000
27-
SAMPLE_EVERY = 250
27+
SAMPLE_EVERY = 500
2828
CHANNEL_FIRST = True
2929

3030
# functions

transfusion_pytorch/transfusion.py

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -813,18 +813,24 @@ def forward(self, x):
813813
x, gates = x.chunk(2, dim = -1)
814814
return F.gelu(gates) * x
815815

816-
def FeedForward(
817-
dim,
818-
expansion_factor = 4.,
819-
dropout = 0.
820-
):
821-
dim_inner = int(dim * expansion_factor * 2 / 3)
822-
return nn.Sequential(
823-
Linear(dim, dim_inner * 2),
824-
GEGLU(),
825-
nn.Dropout(dropout),
826-
Linear(dim_inner, dim)
827-
)
816+
class FeedForward(Module):
    """Gated (GEGLU) feedforward block.

    Projects up to twice the hidden width, gates with GEGLU (which halves
    the width again), applies dropout, then projects back down to `dim`.
    Kept as a Module (rather than a bare nn.Sequential factory) so the
    class can be detected via isinstance checks elsewhere in the file.
    """

    def __init__(
        self,
        dim,
        expansion_factor = 4.,
        dropout = 0.
    ):
        super().__init__()

        # 2/3 scaling keeps the gated variant's parameter count roughly
        # equal to a plain feedforward with the same expansion factor
        hidden = int(dim * expansion_factor * 2 / 3)

        layers = [
            Linear(dim, hidden * 2),
            GEGLU(),
            nn.Dropout(dropout),
            Linear(hidden, dim)
        ]

        # NOTE: net[0] / net[-1] are accessed positionally by muon_parameters
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out
828834

829835
class Attention(Module):
830836
def __init__(
@@ -847,9 +853,14 @@ def __init__(
847853
assert not (use_flex_attn and not exists(flex_attention)), 'flex attention is only available on torch 2.5.0 (nightly) onwards'
848854
self.use_flex_attn = use_flex_attn
849855

850-
self.to_qkv = nn.Sequential(
851-
Linear(dim, dim_inner * 3, bias = False),
852-
Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = heads)
856+
self.to_qk = nn.Sequential(
857+
Linear(dim, dim_inner * 2, bias = False),
858+
Rearrange('b n (qk h d) -> qk b h n d', qk = 2, h = heads)
859+
)
860+
861+
self.to_v = nn.Sequential(
862+
Linear(dim, dim_inner, bias = False),
863+
Rearrange('b n (h d) -> b h n d', h = heads)
853864
)
854865

855866
self.to_learned_value_residual = nn.Sequential(
@@ -902,7 +913,7 @@ def forward(
902913

903914
# project to queries, keys, values
904915

905-
q, k, v = self.to_qkv(x)
916+
q, k, v = (*self.to_qk(x), self.to_v(x))
906917

907918
# value residual
908919

@@ -1522,6 +1533,23 @@ def parameters_without_encoder_decoder(self):
15221533
set(self.modality_decoder.parameters())
15231534
)
15241535

1536+
def muon_parameters(self):
    """Gather the parameters intended for the Muon optimizer.

    Walks all submodules and collects, per Attention block, the value and
    output projection parameters, and per FeedForward block, the first and
    last linear weights of its `net` Sequential. Everything else (query/key
    projections, norms, embeddings, etc.) is left for the companion
    optimizer. Returns a flat list of Parameters.
    """
    collected = []

    for module in self.modules():
        if isinstance(module, Attention):
            collected.extend(module.to_v.parameters())
            collected.extend(module.to_out.parameters())
        elif isinstance(module, FeedForward):
            # positional access assumes FeedForward.net is
            # Sequential(Linear, GEGLU, Dropout, Linear)
            collected.append(module.net[0].weight)
            collected.append(module.net[-1].weight)

    return collected
15251553
def create_dataloader(
15261554
self,
15271555
*args,

0 commit comments

Comments (0)