[Core][Distributed] fix pynccl del error (#4508)
This commit is contained in:
parent
3a922c1e7e
commit
6ef09b08f8
@ -200,6 +200,10 @@ _c_ncclAllReduce.argtypes = [
|
|||||||
ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
|
ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Be cautious! This is a collective call: it blocks until all
|
||||||
|
# processes in the communicator have called this function.
|
||||||
|
# Because Python object destruction can happen in arbitrary order,
|
||||||
|
# it is better not to call ncclCommDestroy at all.
|
||||||
# equivalent to c declaration:
|
# equivalent to c declaration:
|
||||||
# ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
# ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||||
_c_ncclCommDestroy = nccl.ncclCommDestroy
|
_c_ncclCommDestroy = nccl.ncclCommDestroy
|
||||||
@ -278,11 +282,3 @@ class NCCLCommunicator:
|
|||||||
ncclDataTypeEnum.from_torch(tensor.dtype),
|
ncclDataTypeEnum.from_torch(tensor.dtype),
|
||||||
ncclRedOpTypeEnum.from_torch(op), self.comm,
|
ncclRedOpTypeEnum.from_torch(op), self.comm,
|
||||||
ctypes.c_void_p(stream.cuda_stream)))
|
ctypes.c_void_p(stream.cuda_stream)))
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
# `dist` module might have been already destroyed
|
|
||||||
if hasattr(dist, 'destroy_process_group'):
|
|
||||||
dist.destroy_process_group()
|
|
||||||
# function might have been already destroyed
|
|
||||||
if _c_ncclCommDestroy is not None:
|
|
||||||
_c_ncclCommDestroy(self.comm)
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user