From eb07d41f959bda6c76b106c1c1c40493720b2fb8 Mon Sep 17 00:00:00 2001 From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com> Date: Thu, 15 Aug 2024 18:01:27 +0300 Subject: [PATCH] Allow accelerator to instantiate the device (#5255) When instantiating torch.device for HPU, it cannot be given an indexed annotation such as "HPU:1", but only "HPU". Moving the device-naming logic into the accelerator solves this issue with a single-line change. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase Co-authored-by: Joe Mayer <114769929+jomayeri@users.noreply.github.com> --- .github/workflows/hpu-gaudi2.yml | 1 - accelerator/hpu_accelerator.py | 5 ++--- deepspeed/runtime/engine.py | 4 ++-- deepspeed/runtime/zero/utils.py | 1 - 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 0272829e8286..4e9ceb32b6b1 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -68,7 +68,6 @@ jobs: (test_flops_profiler.py and test_flops_profiler_in_inference) test_get_optim_files.py test_groups.py - test_init_on_device.py test_partition_balanced.py (test_adamw.py and TestAdamConfigs) test_coalesced_collectives.py diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 485b205f3e67..1f407e86787e 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -42,9 +42,8 @@ def handles_memory_backpressure(self): return True def device_name(self, device_index=None): - if device_index is None: - return 'hpu' - return 'hpu:{}'.format(device_index) + # ignoring device_index. 
+ return 'hpu' def device(self, device_index=None): return torch.device(self.device_name(device_index)) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index d2839a8f5d7c..27d294b3ae01 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1009,13 +1009,13 @@ def _set_distributed_vars(self, args): device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank if device_rank >= 0: get_accelerator().set_device(device_rank) - self.device = torch.device(get_accelerator().device_name(), device_rank) + self.device = torch.device(get_accelerator().device_name(device_rank)) self.world_size = dist.get_world_size() self.global_rank = dist.get_rank() else: self.world_size = 1 self.global_rank = 0 - self.device = torch.device(get_accelerator().device_name()) + self.device = get_accelerator().device() # Configure based on command line arguments def _configure_with_arguments(self, args, mpu): diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index fae725819a6b..8f913d065934 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -68,7 +68,6 @@ def get_lst_from_rank0(lst: List[int]) -> None: lst_tensor = torch.tensor( lst if dist.get_rank() == 0 else [-1] * len(lst), dtype=int, - # device=get_accelerator().current_device_name(), device=torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])), requires_grad=False, )