import os
import sys
from datetime import datetime

import torch
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

# Walk two directory levels up from this file and append that directory to
# sys.path; the project-local imports below rely on it being importable.
path = os.path.abspath(__file__)
for i in range(2):
    path = os.path.dirname(path)
PROJECT_ROOT = path
sys.path.append(PROJECT_ROOT)

from configs.config import ConfigManager
from datasets.dataset_factory import DatasetFactory
from optimizers.optimizer_factory import OptimizerFactory
from evaluations.eval_function_factory import EvalFunctionFactory
from losses.loss_function_factory import LossFunctionFactory
from modules.pipeline import Pipeline
from runners.runner import Runner
from utils.file_util import FileUtil
from utils.tensorboard_util import TensorboardWriter
from annotations.external_module import EXTERNAL_FREEZE_MODULES


class Trainer(Runner):
    """Runner that trains a ``Pipeline`` and evaluates it after every epoch.

    The constructor builds the pipeline, the training dataset, the optimizer,
    the test datasets and one TensorBoard ``SummaryWriter`` per dataset from
    the values returned by ``ConfigManager``. ``run`` then alternates
    ``train`` and ``test`` and writes checkpoints.

    ``self.experiment_path``, ``self.experiments_config`` and ``self.device``
    are read here but never assigned in this class; presumably they are set
    by ``Runner.__init__`` (verify against ``runners/runner.py``).
    """

    # Sub-directory names used below the experiment directory.
    CHECKPOINT_DIR_NAME: str = 'checkpoints'
    TENSORBOARD_DIR_NAME: str = 'tensorboard'
    LOG_DIR_NAME: str = 'log'

    def __init__(self, config_path):
        """Build every object needed for training from the configuration.

        Args:
            config_path: Path of the configuration file, forwarded unchanged
                to ``Runner.__init__``.

        Raises:
            ValueError: If two test datasets share the same ``name``.
        """
        super().__init__(config_path)
        tensorboard_path = os.path.join(self.experiment_path, Trainer.TENSORBOARD_DIR_NAME)

        # --- Pipeline ---
        self.pipeline_config = ConfigManager.get("settings", "pipeline")
        self.parallel = ConfigManager.get("settings", "general", "parallel")
        self.pipeline = Pipeline(self.pipeline_config)
        # The pipeline is only wrapped when parallelism is requested AND the
        # device is CUDA, so ``self.parallel`` alone does not tell whether
        # ``self.pipeline`` is a DataParallel wrapper.
        if self.parallel and self.device == "cuda":
            self.pipeline = torch.nn.DataParallel(self.pipeline)
        self.pipeline = self.pipeline.to(self.device)

        # --- Experiment ---
        self.current_epoch = 0
        self.max_epochs = self.experiments_config["max_epochs"]
        self.test_first = self.experiments_config["test_first"]
        # May restore weights and overwrite ``self.current_epoch``; it needs
        # the pipeline above to exist already.
        self.load_experiment("train")

        # --- Train ---
        self.train_config = ConfigManager.get("settings", "train")
        self.train_dataset_config = self.train_config["dataset"]
        self.train_set = DatasetFactory.create(self.train_dataset_config)
        self.optimizer = OptimizerFactory.create(
            self.train_config["optimizer"], self.pipeline.parameters())
        self.train_writer = SummaryWriter(
            log_dir=os.path.join(tensorboard_path, f"[train]{self.train_dataset_config['name']}"))

        # --- Test ---
        self.test_config = ConfigManager.get("settings", "test")
        self.test_dataset_config_list = self.test_config["dataset_list"]
        self.test_set_list = []
        self.test_writer_list = []
        # Names double as TensorBoard log directory names, so they must be
        # unique across the test datasets.
        seen_names = set()
        for test_dataset_config in self.test_dataset_config_list:
            dataset_name = test_dataset_config["name"]
            if dataset_name in seen_names:
                raise ValueError("Duplicate test dataset name: {}".format(dataset_name))
            seen_names.add(dataset_name)

            test_set = DatasetFactory.create(test_dataset_config)
            test_writer = SummaryWriter(
                log_dir=os.path.join(tensorboard_path, f"[test]{test_dataset_config['name']}"))
            self.test_set_list.append(test_set)
            self.test_writer_list.append(test_writer)

        self.print_info()

    def run(self):
        """Run the train/test loop until ``max_epochs`` is reached.

        A checkpoint is saved every ``save_checkpoint_interval`` epochs and a
        final "last" checkpoint is saved after the loop. TensorBoard writers
        are flushed on the way out, including when an exception propagates,
        so buffered events are not lost.
        """
        save_interval = self.experiments_config["save_checkpoint_interval"]
        if self.current_epoch != 0:
            print("Continue training from epoch {}.".format(self.current_epoch))
        else:
            print("Start training from initial model.")

        try:
            if self.test_first:
                print("Do test first.")
                self.test()
            while self.current_epoch < self.max_epochs:
                self.current_epoch += 1
                self.train()
                self.test()
                if self.current_epoch % save_interval == 0:
                    self.save_checkpoint()
            self.save_checkpoint(is_last=True)
        finally:
            self._flush_writers()

    def _flush_writers(self):
        """Flush every TensorBoard writer owned by this trainer to disk."""
        for writer in (self.train_writer, *self.test_writer_list):
            writer.flush()

    def train(self):
        """Train for one epoch and log the per-iteration losses."""
        self.pipeline.train()
        train_set_name = self.train_dataset_config["name"]
        ratio = self.train_dataset_config["ratio"]
        # NOTE(review): the loader device is hard-coded to "cuda" here while
        # test() passes "cpu"; confirm this is intended on CPU-only runs.
        train_loader = self.train_set.get_loader(device="cuda", shuffle=True)
        loader_length = len(train_loader)

        loop = tqdm(enumerate(train_loader), total=loader_length)
        for i, data in loop:
            self.train_set.process_batch(data, self.device)
            loss_dict = self.train_step(data)
            loop.set_description(
                f'Epoch [{self.current_epoch}/{self.max_epochs}] (Train: {train_set_name}, ratio={ratio})')
            loop.set_postfix(loss=loss_dict)
            # Global iteration index; epochs are 1-based inside the loop.
            curr_iters = (self.current_epoch - 1) * loader_length + i
            TensorboardWriter.write_tensorboard(self.train_writer, "iter", loss_dict, curr_iters)

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: A batch as yielded by the training loader.

        Returns:
            dict: Loss name -> loss value rounded to 5 decimals, including
            the ``'total_loss'`` entry added by ``loss_fn``.
        """
        self.optimizer.zero_grad()
        output = self.pipeline(data, Pipeline.TRAIN_MODE)
        total_loss, loss_dict = self.loss_fn(output, data)
        total_loss.backward()
        self.optimizer.step()
        return {name: round(value, 5) for name, value in loss_dict.items()}

    def loss_fn(self, output, data):
        """Compute the weighted sum of all configured losses.

        Args:
            output: Pipeline output for the batch.
            data: The batch itself.

        Returns:
            tuple: ``(total_loss, loss_dict)`` where ``total_loss`` is a
            tensor suitable for ``backward()`` and ``loss_dict`` maps each
            loss name (plus ``'total_loss'``) to a Python float.
        """
        loss_config = self.train_config["losses"]
        loss_dict = {}
        total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.device)
        for key, weight in loss_config.items():
            target_loss_fn = LossFunctionFactory.create(key)
            loss = target_loss_fn(output, data)
            # The unweighted value is logged; the weight only affects the total.
            loss_dict[key] = loss.item()
            total_loss = total_loss + weight * loss
        loss_dict['total_loss'] = total_loss.item()
        return total_loss, loss_dict

    def test(self):
        """Evaluate the pipeline on every test dataset and log the results."""
        self.pipeline.eval()
        with torch.no_grad():
            # The three lists are filled in lockstep in __init__, so zip
            # pairs each dataset with its own config and writer.
            per_dataset = zip(self.test_set_list, self.test_dataset_config_list, self.test_writer_list)
            for test_set, dataset_config, writer in per_dataset:
                eval_list = dataset_config["eval_list"]
                test_set_name = dataset_config["name"]
                ratio = dataset_config["ratio"]

                output_list = []
                data_list = []
                test_loader = test_set.get_loader("cpu")
                loop = tqdm(test_loader, total=len(test_loader))
                for data in loop:
                    test_set.process_batch(data, self.device)
                    output = self.pipeline(data, Pipeline.TEST_MODE)
                    output_list.append(output)
                    data_list.append(data)
                    loop.set_description(
                        f'Epoch [{self.current_epoch}/{self.max_epochs}] (Test: {test_set_name}, ratio={ratio})')

                result_dict = self.eval_fn(output_list, data_list, eval_list)
                # Step is current_epoch - 1, so a "test first" pass at
                # epoch 0 is written at step -1.
                TensorboardWriter.write_tensorboard(writer, "epoch", result_dict, self.current_epoch - 1)

    @staticmethod
    def eval_fn(output_list, data_list, eval_list):
        """Evaluate collected outputs with the function built for ``eval_list``.

        Args:
            output_list: Pipeline outputs, one per batch.
            data_list: The corresponding batches.
            eval_list: Evaluation spec passed to ``EvalFunctionFactory.create``.

        Returns:
            Whatever the created evaluation function returns.
        """
        target_eval_fn = EvalFunctionFactory.create(eval_list)
        return target_eval_fn(output_list, data_list)

    def get_checkpoint_path(self, is_last=False):
        """Return the checkpoint file path for the current epoch.

        Args:
            is_last: When true, return the path of the "last" checkpoint.

        Returns:
            str: ``<experiment>/checkpoints/Epoch_<N>.pth``, or
            ``Epoch_last.pth`` when ``is_last`` is true or the current
            epoch is ``-1``.
        """
        use_last = is_last or self.current_epoch == -1
        tag = "last" if use_last else self.current_epoch
        return os.path.join(
            self.experiment_path, Trainer.CHECKPOINT_DIR_NAME, "Epoch_{}.pth".format(tag))

    def load_checkpoint(self, is_last=False):
        """Restore pipeline weights from a checkpoint.

        Args:
            is_last: When true, load the "last" checkpoint and restore
                ``current_epoch`` from ``meta.json``.

        Raises:
            FileNotFoundError: If ``is_last`` is true and ``meta.json`` is
                missing from the checkpoint directory.
        """
        checkpoint_path = self.get_checkpoint_path(is_last)
        self.load(checkpoint_path)
        print(f"Loaded checkpoint from {checkpoint_path}")
        if not is_last:
            return

        checkpoint_root = os.path.join(self.experiment_path, Trainer.CHECKPOINT_DIR_NAME)
        meta_path = os.path.join(checkpoint_root, "meta.json")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(
                "No checkpoint meta.json file in the experiment {}".format(self.experiments_config["name"]))
        meta = FileUtil.load_json("meta.json", checkpoint_root)
        self.current_epoch = meta["last_epoch"]

    def save_checkpoint(self, is_last=False):
        """Save the pipeline weights for the current epoch.

        Args:
            is_last: When true, write ``Epoch_last.pth`` together with a
                ``meta.json`` holding the epoch number and a timestamp;
                otherwise write ``Epoch_<N>.pth`` and print its location.
        """
        checkpoint_path = self.get_checkpoint_path(is_last)
        self.save(checkpoint_path)
        if not is_last:
            print(f"Checkpoint at epoch {self.current_epoch} saved to {checkpoint_path}")
            return

        meta = {
            "last_epoch": self.current_epoch,
            "time": str(datetime.now()),
        }
        checkpoint_root = os.path.join(self.experiment_path, Trainer.CHECKPOINT_DIR_NAME)
        FileUtil.save_json(meta, "meta.json", checkpoint_root)

    def load_experiment(self, backup_name=None):
        """Load the experiment and, if configured, resume from a checkpoint.

        Args:
            backup_name: Forwarded unchanged to ``Runner.load_experiment``.
        """
        super().load_experiment(backup_name)
        if self.experiments_config["use_checkpoint"]:
            self.current_epoch = self.experiments_config["epoch"]
            # An epoch of -1 in the config means "resume from the last checkpoint".
            self.load_checkpoint(is_last=(self.current_epoch == -1))

    def create_experiment(self, backup_name=None):
        """Create the experiment plus its checkpoint and TensorBoard directories.

        Args:
            backup_name: Forwarded unchanged to ``Runner.create_experiment``.
        """
        super().create_experiment(backup_name)
        ckpt_dir = os.path.join(str(self.experiment_path), Trainer.CHECKPOINT_DIR_NAME)
        os.makedirs(ckpt_dir)
        tensorboard_dir = os.path.join(str(self.experiment_path), Trainer.TENSORBOARD_DIR_NAME)
        os.makedirs(tensorboard_dir)

    def _unwrapped_pipeline(self):
        """Return the underlying pipeline, unwrapping ``DataParallel`` if present.

        Checking the actual type rather than ``self.parallel`` matters
        because __init__ skips the wrapper on non-CUDA devices even when
        ``parallel`` is enabled.
        """
        if isinstance(self.pipeline, torch.nn.DataParallel):
            return self.pipeline.module
        return self.pipeline

    def load(self, path):
        """Load a state dict from ``path`` into the pipeline.

        Args:
            path: Checkpoint file written by ``save``.
        """
        # map_location lets a checkpoint saved on one device be restored on
        # the device this run actually uses.
        state_dict = torch.load(path, map_location=self.device)
        self._unwrapped_pipeline().load_state_dict(state_dict)

    def save(self, path):
        """Save the pipeline's state dict to ``path``.

        Args:
            path: Destination file of the checkpoint.
        """
        model = self._unwrapped_pipeline()
        state_dict = model.state_dict()
        # NOTE(review): this compares module names with state-dict keys.
        # State-dict keys are parameter/buffer names (e.g. "enc.weight"), so
        # a module name such as "enc" will rarely if ever match and frozen
        # external modules probably stay in the checkpoint. Kept as-is
        # because dropping them would also require a non-strict load;
        # confirm the intended behaviour.
        for name, module in model.named_modules():
            if module.__class__ in EXTERNAL_FREEZE_MODULES and name in state_dict:
                del state_dict[name]
        torch.save(state_dict, path)

    def print_info(self):
        """Print the runner info, the pipeline and a summary of every dataset."""

        def print_dataset(config, dataset):
            """Print the summary fields of one dataset."""
            print("\t name: {}".format(config["name"]))
            print("\t source: {}".format(config["source"]))
            print("\t data_type: {}".format(config["data_type"]))
            print("\t total_length: {}".format(len(dataset)))
            print("\t ratio: {}".format(config["ratio"]))
            print()

        super().print_info()
        table_size = 70
        half_rule = '-' * (table_size // 2)
        print(f"+{half_rule} Pipeline {half_rule}+")
        print(self.pipeline)
        print(f"+{half_rule} Datasets {half_rule}+")
        print("train dataset: ")
        print_dataset(self.train_dataset_config, self.train_set)
        for i, test_dataset_config in enumerate(self.test_dataset_config_list):
            print(f"test dataset {i}: ")
            print_dataset(test_dataset_config, self.test_set_list[i])
        print(f"+{half_rule}----------{half_rule}+")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="configs/server_train_config.yaml")
    args = parser.parse_args()
    trainer = Trainer(args.config)
    trainer.run()