iamwyldecat committed
Commit 64757cb · 1 Parent(s): 036642a

fix(muon): free tensors that are no longer needed
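The main change is in muon.py: the Muon optimizer gains a none_grad option (default True). When enabled, _gather sets p.grad = None as soon as the gradient has been gathered to its worker rank, and the gathered gradient and the computed update are now released by dropping the Python references (state.gathered_grad = None, state.computed_u = None) instead of record_stream() followed by del. The _ops.py and .so changes below are the rebuilt extension picking up the new build hash. A minimal usage sketch follows; the import path and the defaults of the remaining constructor arguments are assumptions, not taken from this diff, and the sketch only constructs the optimizer (actually stepping Muon requires the DTensor/distributed setup it is written for).

    # Sketch only; import path and remaining constructor defaults are assumed.
    import torch
    from optimizer import Muon

    model = torch.nn.Linear(1024, 1024)

    # none_grad=True (the new default) frees each p.grad right after its gradient
    # has been gathered to the worker rank, before the compute and scatter phases.
    opt = Muon(model, none_grad=True)

    # none_grad=False keeps p.grad alive for the whole step, matching the behaviour
    # before this commit.
    opt_legacy = Muon(model, none_grad=False)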
Files changed (34)
  1. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  3. build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +10 -7
  4. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  5. build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  6. build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +10 -7
  7. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  8. build/torch26-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  9. build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +10 -7
  10. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  12. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +10 -7
  13. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  15. build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +10 -7
  16. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  17. build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  18. build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +10 -7
  19. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  20. build/torch26-cxx98-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  21. build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +10 -7
  22. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  23. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  24. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +10 -7
  25. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  26. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  27. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +10 -7
  28. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  29. build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  30. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +10 -7
  31. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  32. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} +1 -1
  33. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +10 -7
  34. torch-ext/optimizer/muon.py +10 -7
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_febdf5b_dirty
-ops = torch.ops._optimizer_febdf5b_dirty
+from . import _optimizer_036642a_dirty
+ops = torch.ops._optimizer_036642a_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_febdf5b_dirty::{op_name}"
+    return f"_optimizer_036642a_dirty::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:98bd4b647ad0ecbae82a5e78f618475b47595c5bb68b3356c09ee8b1f1a57060
+oid sha256:9c77e5647b6056bfaee25050cca7948c40859db0a88fa4fcf40b67a85c947d8c
 size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -53,7 +53,7 @@ class _muon_state:
 
 
 @torch.no_grad()
-def _gather(p, state, rank, comm_stream):
+def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     mesh = g.device_mesh
 
@@ -70,7 +70,6 @@ def _gather(p, state, rank, comm_stream):
         group=mesh.get_group(),
     )
     if rank == state.worker_rank:
-        # TODO: Consider ,,,
         if state.gathered_grad is not None:
             raise RuntimeError(
                 "Gather event already exists, which should not happen."
@@ -81,6 +80,8 @@ def _gather(p, state, rank, comm_stream):
     else:
         state.gathered_grad = None
         state.gather_event = None
+    if none_grad:
+        p.grad = None
 
 
 @torch.no_grad()
@@ -94,8 +95,8 @@ def _compute_u(state, steps, rank, compute_stream):
         state.computed_u = u
         state.compute_event = torch.cuda.Event()
         state.compute_event.record()
-        state.gathered_grad.record_stream(compute_stream)
-        del state.gathered_grad
+        # Clear the gathered gradient to free memory
+        state.gathered_grad = None
     else:
         state.computed_u = None
         state.compute_event = None
@@ -123,8 +124,8 @@ def _scatter(p, state, lr, wd, rank, comm_stream):
         group=mesh.get_group(),
     )
     if rank == state.worker_rank:
-        state.computed_u.record_stream(comm_stream)
-        del state.computed_u
+        # Clear u to free memory
+        state.computed_u = None
     u = DTensor.from_local(
         u,
         placements=p.placements,
@@ -172,6 +173,7 @@ class Muon(torch.optim.Optimizer):
         adamw_wd=0.1,
         adamw_betas=(0.9, 0.95),
         adamw_eps=1e-8,
+        none_grad=True,
         debug=False,
     ):
         defaults = dict(
@@ -182,6 +184,7 @@ class Muon(torch.optim.Optimizer):
             ns_steps=ns_steps,
             adamw_betas=adamw_betas,
             adamw_eps=adamw_eps,
+            none_grad=none_grad,
         )
 
         super().__init__(model.parameters(), defaults)
@@ -350,7 +353,7 @@ class Muon(torch.optim.Optimizer):
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _gather(p, state, self.rank, self.comm_stream)
+                _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
         def enqueue_computes(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
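Mechanically, the diff above replaces the record_stream() + del pattern with plain reference drops, presumably relying on the gather/compute events the surrounding code already records to order reuse of the memory. A standalone sketch of the two release patterns (not from this repo; needs a CUDA device):

    import torch

    comm_stream = torch.cuda.Stream()

    # Old pattern: tell the caching allocator the tensor is still in use on the
    # side stream, then drop the reference.
    old_buf = torch.empty(1 << 20, device="cuda")
    with torch.cuda.stream(comm_stream):
        old_buf.add_(1.0)              # stand-in for the gather/compute work
    old_buf.record_stream(comm_stream)
    del old_buf

    # New pattern in this commit: record an event for later synchronization and
    # simply drop the reference (state.gathered_grad = None / state.computed_u = None).
    new_buf = torch.empty(1 << 20, device="cuda")
    with torch.cuda.stream(comm_stream):
        new_buf.add_(1.0)
    done_event = torch.cuda.Event()
    done_event.record(comm_stream)
    new_buf = None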
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:796ac374cd2eec4260591c5a771c6b324f7dc6c8f34fc5dc211ab8afca546ffe
+oid sha256:94ea66089cc8d9eda72b017733a9e05e4fee5a2f04c50658b690d2c19f0d3068
 size 1824224
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:254706f111eb794b1409ba48d25649ace5438e2c66027727e84490011ee4c5e6
+oid sha256:46e01e1d957ada2d485b30cd60bc3ef7230b8857dffc59f2e7924339761ec577
 size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:027a26212a3dd705876ca83015a53b69d17d80fe7c1559fb01d7aacf614edb57
+oid sha256:a825a0cd31d8c1b91aa9db4b24248d7fc0a506615f625a385b40e6002025c7dd
 size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:62c4408eaf54197941241ae6150afe1401a8bcf5854488a8b957d1f1546b388a
+oid sha256:579e9ddf66a4f17ead9232c2f32e6327fe6a3f16dd235e2e73e6cb282de1797e
 size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:166d253c91459e1aa1328a1550b0e3ec4bb7c6057870b1d7472a93cc987cf85a
+oid sha256:beacb4ba2d56463b6d444875728b3462cb3ff6c1449e3c9693cd665bfbbbbb73
 size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch26-cxx98-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8bb7315b326f9af7a77e023c2b78511190235a8dcc9682abd5b49db1dc2b90f2
+oid sha256:9b04b011803d328d8dcd2edcf4c3840ddbb1bb2f093464c208f0ba2faf4f16bc
 size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a74351ee471271eaf1c8292ed01b7e71e6b1b683704144d68d90b67032ba386
+oid sha256:ad6c725009f2e776b99d3134c75f15e11dd7fe75fe4ba1fa94779018c7871f8c
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffb7e3a786405106908da16e74506fe381b09e5e04a27b1062396e378f63f7f7
+oid sha256:50cb5819ff08a2179d78cd98164d07fd3cef1b66ee7703d599a310dfb140b9d1
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45ee6c653f216af96705a25993d85751648ccd4714a8d6c8c36bdbc8dc19edc5
+oid sha256:9c75e42265f382addc71327ad5628e8a2414da5872791c975e384708c4acd549
 size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the namespace is renamed from _optimizer_febdf5b_dirty to _optimizer_036642a_dirty.
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_febdf5b_dirty.abi3.so → _optimizer_036642a_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8427dae3274100063f3b003a7cebf9565318fcaa2fa340482b2ec9408e9dcea0
+oid sha256:9a2363d4311d6a75fbcc03e6d4a71c73dae4d54e00a30135d25198d4078c6b0f
 size 1749648
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.
torch-ext/optimizer/muon.py CHANGED
Same change as build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py above.