Merge branch 'master' into zhejiang/fix_runtime_dataloader_shuffle

microsoft · Jan 10, 2025 · a5b3b2e · a5b3b2e
2 parents 26d02df + 1d15ef0
commit a5b3b2e
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 8 deletions.
diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
@@ -43,7 +43,7 @@ jobs:
 
       - name: Install deepspeed
         run: |
-          pip install transformers==4.45.2
+          pip install transformers
           pip install .[dev]
           ds_report
 

diff --git a/SECURITY.md b/SECURITY.md
@@ -39,3 +39,7 @@ We prefer all communications to be in English.
 Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
 
 <!-- END MICROSOFT SECURITY.MD BLOCK -->
+
+---
+
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
@@ -134,7 +134,8 @@ def is_load_module(module):
         load_layer_names = [
             "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear",
             "MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm", "Phi3RotaryEmbedding", "Phi3SuScaledRotaryEmbedding",
-            "Phi3RMSNorm", "YuanRMSNorm", "YuanRotaryEmbedding", "Phi3LongRoPEScaledRotaryEmbedding", "Qwen2RMSNorm"
+            "Phi3RMSNorm", "YuanRMSNorm", "YuanRotaryEmbedding", "Phi3LongRoPEScaledRotaryEmbedding", "Qwen2RMSNorm",
+            "DeepseekV2RMSNorm", "DeepseekV2YarnRotaryEmbedding", "MoEGate"
         ]
         return module.__class__ in load_layers or module._get_name() in load_layer_names
 
@@ -332,9 +333,9 @@ def _replace(self, child, name, conv_linear_layer):
             return
         weight_shape = child.weight.shape
         mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)
-        # For mixtral-7x8b, need to skip MoE gate linear replace.
-        if name == "block_sparse_moe.gate" or (('mlp.shared_expert_gate' == name or 'mlp.gate' == name)
-                                               and 'qwen2_moe' in str(type(self.module))):
+        # For TP layer skip, e.g., MoE gate, deepseek low rank layer skip
+        if "q_a_proj" in name or "kv_a_proj_with_mqa" in name or name == "block_sparse_moe.gate" or (
+            ('mlp.shared_expert_gate' == name or 'mlp.gate' == name) and 'qwen2_moe' in str(type(self.module))):
             return child
         # For Yuan model
         if 'Yuan' in str(self.module):
@@ -350,11 +351,15 @@ def _replace(self, child, name, conv_linear_layer):
         arctic_w2_all_reduce_linear = False
         if 'Arctic' in str(self.module) and 'w2' in name:
             arctic_w2_all_reduce_linear = True
+        # For MoE MLP model, e.g., deepseek and jamba
+        down_proj = False
+        if 'down_proj' in name:
+            down_proj = True
         # For MLP including chunk layer.
         if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
             weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
             return LinearLayer(weight=weight, bias=bias)
-        if name in self.all_reduce_linears or arctic_w2_all_reduce_linear:
+        if name in self.all_reduce_linears or arctic_w2_all_reduce_linear or down_proj:
             # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
             # else [weight_shape[0], weight_shape[1] // mp_size]
 

diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py
@@ -191,7 +191,7 @@ def __init__(self, weight_shape=None, weight=None, bias=None):
         self.offset = 2
         super().__init__(weight_shape, weight=weight)
 
-    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
+    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0, position_ids: int = 0):
         """`input_ids_shape` is expected to be [bsz x seqlen]."""
         attention_mask = attention_mask.long()
 

diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py
@@ -42,11 +42,16 @@ def get_num_attention_heads():
 def get_shard_size(total_size, mp_size, name=None, rank=None):
     global num_kv_heads
     last_linear = ["lm_head", "embed_out"]
+    # MoE MLP layer use near even division will get better perf.
+    moe_mlp_layer = ["gate_proj", "up_proj", "down_proj", "w1", "w2", "w3"]
+    not_moe_mlp_layer = True
+    if name != None and any(s in str(name) for s in moe_mlp_layer):
+        not_moe_mlp_layer = False
     # When we have num_kv_heads defined, uneven division is possible, otherwise enforce near even division
     if rank == None:
         rank = dist.get_rank()
     if num_kv_heads != None and total_size % num_kv_heads == 0 and "mlp" not in str(name) and str(
-            name) not in last_linear:
+            name) not in last_linear and not_moe_mlp_layer:
         my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0)
         return total_size * my_slices // num_kv_heads
     else: