import builtins
import fcntl
import random

import numpy as np
import torch

import src.distributed.process_group_manager as pgm


def print(*args, **kwargs):
    """
    Solve the multi-process interleaved-print problem.

    Shadows the built-in print: each process takes an exclusive advisory
    lock on this file before writing, so output lines from different ranks
    never interleave mid-line on a shared terminal.
    """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            builtins.print(*args, **kwargs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)


def set_all_seed(seed):
    """Seed Python, NumPy, and PyTorch (including all CUDA devices) for reproducibility."""
    for module in [random, np.random]:
        module.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# def display_4D_parallelism_grid():
#     #TODO(fmom): fix me
#     #TODO(fmom): add color to distinguish between different parallelism groups
#     def create_gpu_box(gpu_num, tp, cp, pp):
#         return [
#             f"+------+",
#             f"|GPU:{gpu_num:<2d}|",
#             f"| TP:{tp:d} |",
#             f"| CP:{cp:d} |",
#             f"| PP:{pp:d} |",
#             f"+------+"
#         ]
#
#     def create_node(start_gpu, tp_size, cp_size, pp_size, node_index):
#         boxes = []
#         for i in range(8):  # 8 GPUs per node
#             gpu_num = start_gpu + i
#             tp = gpu_num % tp_size
#             cp = (gpu_num // tp_size) % cp_size
#             pp = (gpu_num // (tp_size * cp_size)) % pp_size
#             boxes.append(create_gpu_box(gpu_num, tp, cp, pp))
#         return [' '.join(row) for row in zip(*boxes)]
#
#     def create_dp_box(replica_output):
#         width = len(replica_output[0]) + 4
#         top_bottom = f"+{'-' * (width - 2)}+"
#         return [top_bottom] + [f"| {line} |" for line in replica_output] + [top_bottom]
#
#     tp_size = pgm.process_group_manager.tp_size
#     cp_size = pgm.process_group_manager.cp_size
#     pp_size = pgm.process_group_manager.pp_size
#     dp_size = pgm.process_group_manager.dp_size
#     total_gpus_per_replica = tp_size * cp_size * pp_size
#     num_nodes_per_replica = (total_gpus_per_replica + 7) // 8  # Round up to nearest whole node
#
#     output = []
#     output.append("=== Simplified Parallelism Configuration ===")
#     output.append(f"TP Size: {tp_size}, CP Size: {cp_size}, PP Size: {pp_size}, DP Size: {dp_size}")
#     output.append(f"Total GPUs for one replica: {total_gpus_per_replica}")
#     output.append(f"Number of nodes per replica: {num_nodes_per_replica} (8 GPUs per node)")
#     output.append(f"Total GPUs: {total_gpus_per_replica * dp_size}")
#     output.append(f"Total nodes: {num_nodes_per_replica * dp_size}")
#     output.append("")
#
#     for dp in range(dp_size):
#         replica_output = []
#         for node in range(num_nodes_per_replica):
#             start_gpu = (dp * total_gpus_per_replica) + (node * 8)
#             node_output = create_node(start_gpu, tp_size, cp_size, pp_size, node)
#             replica_output.append(f"Node {dp * num_nodes_per_replica + node}:")
#             replica_output.extend(node_output)
#             replica_output.append("")
#
#         dp_box = create_dp_box(replica_output)
#         output.append(f"Data Parallel Group {dp}:")
#         output.extend(dp_box)
#         output.append("")
#
#     print("\n".join(output))
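
# ---------------------------------------------------------------------------
# Hypothetical helper (an illustrative sketch, not part of the original
# file): the commented-out display function above maps a flat GPU index to
# (tp, cp, pp) coordinates with nested modular arithmetic. This standalone
# version (the name `rank_to_coords` is an assumption, not a repo API) makes
# that mapping checkable without any process group: ranks vary fastest along
# TP, then CP, then PP, with the DP replica as the outermost dimension.
# ---------------------------------------------------------------------------
def rank_to_coords(rank, tp_size, cp_size, pp_size):
    """Return (tp, cp, pp, dp) coordinates for a flat global rank."""
    tp = rank % tp_size
    cp = (rank // tp_size) % cp_size
    pp = (rank // (tp_size * cp_size)) % pp_size
    dp = rank // (tp_size * cp_size * pp_size)
    return tp, cp, pp, dp

# Example: with tp_size = cp_size = pp_size = 2, rank 5 -> (1, 0, 1, 0)
# and rank 11 -> (1, 1, 0, 1).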
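
# ---------------------------------------------------------------------------
# Hypothetical self-test (an illustrative sketch, not from the original
# repo). Launching with e.g. `torchrun --nproc_per_node=2 -m src.utils`
# from the repo root shows that set_all_seed makes every rank draw an
# identical tensor and that the lock-guarded print keeps per-rank lines
# from interleaving. The gloo backend is assumed so the demo runs without
# GPUs; torchrun supplies the rendezvous environment variables.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch.distributed as dist

    dist.init_process_group(backend="gloo")
    set_all_seed(42)
    sample = torch.randn(3)  # identical seeds -> bitwise-equal on every rank
    print(f"rank {dist.get_rank()}/{dist.get_world_size()}: {sample}")
    dist.destroy_process_group()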