From b8207048b56b0b76d2270e3495035bc79537af3a Mon Sep 17 00:00:00 2001 From: yangcheng Date: Mon, 15 Apr 2024 10:25:17 +0800 Subject: [PATCH 1/3] fix the accelerator issue Co-authored-by: eigen2017 --- README.md | 11 ++++++----- accelerator/abstract_accelerator.py | 8 ++++++++ accelerator/cpu_accelerator.py | 7 +++++++ accelerator/cuda_accelerator.py | 7 +++++++ accelerator/hpu_accelerator.py | 7 +++++++ accelerator/mps_accelerator.py | 8 ++++++++ accelerator/npu_accelerator.py | 7 +++++++ accelerator/xpu_accelerator.py | 7 +++++++ deepspeed/launcher/launch.py | 11 +++++++---- 9 files changed, 64 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a1335caa4949..201b9016f8ab 100755 --- a/README.md +++ b/README.md @@ -159,11 +159,12 @@ dynamically link them at runtime. ## Contributed HW support * DeepSpeed now support various HW accelerators. -| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | -| ----------- | -------- | ---------------- | --------------------- | ------------------ | -| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | -| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | +| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | +|-------------|-------------------------------------|------------------| --------------------- |--------------------| +| Huawei | Huawei Ascend NPU | npu | Yes | No | +| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | +| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. 
diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 3c5d799e293e..768d5ea34e5e 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -287,3 +287,11 @@ def build_extension(self): @abc.abstractmethod def export_envs(self): ... + + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 870d3e91816e..4287b19abd9e 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -320,3 +320,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 3d5e9c168c16..2fc0cfd94125 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -360,3 +360,10 @@ def build_extension(self): def export_envs(self): return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 30b115e8b1ab..326efc8fa01b 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -294,3 +294,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['HABANA_VISIBLE_MODULES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + 
current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 972b33caece1..ee3ba4ca574f 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -258,3 +258,11 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + # TODO: could not find visible devices env for mps + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 472157e32c02..5d891ecb707d 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -278,3 +278,10 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['ASCEND_RT_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 3f65263946ab..c59f60077d2f 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -289,3 +289,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['ZE_AFFINITY_MASK'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index ffb9cbc18fbd..079182a3b45b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -22,6 +22,7 @@ from collections import defaultdict from typing import Dict from argparse 
import ArgumentParser, REMAINDER +from deepspeed.accelerator import get_accelerator from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger, get_numactl_cmd @@ -146,8 +147,8 @@ def main(): node_list = list(world_info.keys()) args.nnodes = len(node_list) local_node = node_list[args.node_rank] - local_gpu_ids = world_info[local_node] - num_local_procs = len(local_gpu_ids) + local_accelerator_ids = world_info[local_node] + num_local_procs = len(local_accelerator_ids) logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) @@ -161,8 +162,10 @@ def main(): curr_global_rank += 1 logger.info(f"global_rank_mapping={global_rank_mapping}") logger.info(f"dist_world_size={dist_world_size}") - current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids)) - logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}") + + get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids) + for env in get_accelerator().visible_devices_envs(): + logger.info(f"Setting {env}={current_env[env]}") # set PyTorch distributed related environmental variables current_env["MASTER_ADDR"] = args.master_addr From d75c634f4a24dd94d700da5e726c84e1a022fc67 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 20 Apr 2024 16:25:38 -0400 Subject: [PATCH 2/3] Update accelerator/cpu_accelerator.py --- accelerator/cpu_accelerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 11228b2d8126..237e7f51dcb4 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -323,6 +323,7 @@ def build_extension(self): def export_envs(self): return [] + # TODO: cpu's visible-devices env is not confirmed; keep as CUDA_VISIBLE_DEVICES for now def visible_devices_envs(self): return ['CUDA_VISIBLE_DEVICES'] From 
b4d465bdd3df6c530ac65169532e25dec63f9486 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 20 Apr 2024 16:25:46 -0400 Subject: [PATCH 3/3] Update accelerator/mps_accelerator.py --- accelerator/mps_accelerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index ee3ba4ca574f..ff70b860d7c7 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -259,6 +259,7 @@ def build_extension(self): def export_envs(self): return [] + # TODO: mps's visible-devices env is not confirmed; keep as CUDA_VISIBLE_DEVICES for now def visible_devices_envs(self): # TODO: could not find visible devices env for mps return ['CUDA_VISIBLE_DEVICES']