Definition
1. Launching with mp.spawn
mp.spawn(fn, args=(), nprocs=1, join=True, daemon=False)  # spawns multiple processes automatically

fn: the entry function of each spawned process; its first argument is automatically filled with the rank of the current process, so the actual call is fn(rank, *args)
nprocs: the number of processes to spawn, i.e. the world_size
args: the remaining arguments of fn, passed as a tuple
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def fn(rank, ws, nums):
    # rank is injected by mp.spawn; ws and nums come from args=(...)
    dist.init_process_group('nccl', init_method='tcp://127.0.0.1:28765',
                            rank=rank, world_size=ws)
    rank = dist.get_rank()
    print(f"rank = {rank} is initialized")
    torch.cuda.set_device(rank)
    tensor = torch.tensor(nums).cuda()
    print(tensor)

if __name__ == "__main__":
    ws = 2
    mp.spawn(fn, nprocs=ws, args=(ws, [1, 2, 3, 4]))

# launch: python test.py
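To confirm that the spawned processes can actually communicate over NCCL, the example above can be extended with a collective call. This is a minimal sketch under the same assumptions as the example (port 28765, a single machine with 2 GPUs); it uses the standard dist.all_reduce API:

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def fn(rank, ws):
    dist.init_process_group('nccl', init_method='tcp://127.0.0.1:28765',
                            rank=rank, world_size=ws)
    torch.cuda.set_device(rank)
    # every rank contributes its own rank value; after the all_reduce
    # each rank holds 0 + 1 + ... + (ws - 1)
    t = torch.tensor([rank], device='cuda')
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    print(f"rank {rank}: sum of ranks = {t.item()}")
    dist.destroy_process_group()

if __name__ == "__main__":
    ws = 2
    mp.spawn(fn, nprocs=ws, args=(ws,))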
2. Launching with torchrun
import os
import torch
import torch.distributed as dist

# torchrun sets RANK, WORLD_SIZE, LOCAL_RANK, MASTER_ADDR and MASTER_PORT
# in the environment, so init_method='env://' needs no explicit address
dist.init_process_group('nccl', init_method='env://')
rank = dist.get_rank()
local_rank = int(os.environ['LOCAL_RANK'])
master_addr = os.environ['MASTER_ADDR']
master_port = os.environ['MASTER_PORT']
print(f"rank = {rank} is initialized in {master_addr}:{master_port}; local_rank = {local_rank}")
torch.cuda.set_device(local_rank)  # node-local rank, so each process gets its own GPU
tensor = torch.tensor([1, 2, 3, 4]).cuda()
print(tensor)

# launch: torchrun --nproc_per_node=2 test.py
--nnodes: number of machines (nodes) used; defaults to 1 for single-node training
--nproc_per_node: number of processes launched on each node; on a single node this equals the world_size
--master_addr / --master_port: address and port of the master process (rank 0)
--node_rank: the rank of the current node among all nodes (not the process rank)
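As an illustration of these flags, a two-node launch might look like the following. The address 192.168.1.1, port 29500, the 8 GPUs per node, and the script name train.py are placeholders, not values from the original:

# on node 0 (the machine whose address is passed as --master_addr)
torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 \
    --master_addr=192.168.1.1 --master_port=29500 train.py

# on node 1
torchrun --nnodes=2 --nproc_per_node=8 --node_rank=1 \
    --master_addr=192.168.1.1 --master_port=29500 train.py

# total world_size = nnodes * nproc_per_node = 16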