Fix moe_normalize_expert_weights when top_k=1 (#87)

* normalize router weights *before* squeezing dim on top-k=1
* keep top-1 optimisation
* Update router.py
databricks · Jan 10, 2024 · 04e4f1f · 04e4f1f
1 parent bcb4979
commit 04e4f1f
Showing 1 changed file with 1 addition and 2 deletions.
diff --git a/megablocks/layers/router.py b/megablocks/layers/router.py
@@ -45,10 +45,9 @@ def jitter(self, x):
 
     def _top_k(self, scores):
         if self.args.moe_top_k == 1:
-            return scores.max(dim=-1)
+            return scores.max(dim=-1,keepdim=True)
         return torch.topk(scores, self.args.moe_top_k, dim=-1)
 
-
     def forward(self, x):
         if self.training and self.args.moe_jitter_eps is not None:
             x = x * self.jitter(x)