# encoding: utf-8
__author__ = "Dimitrios Karkalousos"
# Taken and adapted from: https://github.com/NVIDIA/NeMo/blob/main/nemo/utils/debug_hook.py
import os

import torch


def get_forward_hook(name, trainer, rank, logger, dump_to_file=False):
    """
    A forward hook to dump all the module input and output norms. It is called every time after forward() has
    computed an output. Only float type input/output tensor norms are computed.

    For more details about the forward hook, check:
    https://pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html

    Parameters
    ----------
    name : str
        Module name used to tag the logged norms.
    trainer : pytorch_lightning.Trainer
        PTL trainer.
    rank : int
        Worker rank.
    logger : callable
        PTL log function.
    dump_to_file : bool, optional
        Whether to dump the csv file to disk, by default False.

    Returns
    -------
    forward_hook : callable
        Hook function to pass to torch.nn.Module.register_forward_hook.
    """
    # Initialize unconditionally so the hook closure can always resolve `fp` and
    # `header`, even when dump_to_file is False and nothing is written to disk.
    fp = None
    header = False
    if dump_to_file:
        os.makedirs("debug_info", exist_ok=True)
        fp = open(f"debug_info/forward_{name}_rank{rank}.txt", "w")

    def forward_hook(module, inputs, outputs):
        """Forward hook to dump all the module input and output norms. It is called every time after forward()
        has computed an output. Only float type input/output tensor norms are computed."""
        nonlocal header
        nonlocal fp
        if trainer.training:
            values = []
            headers = []
            for n, i in enumerate(inputs):
                if isinstance(i, torch.Tensor) and i.dtype in [torch.float, torch.half, torch.bfloat16]:
                    if not header:
                        headers.append("input")
                    input_norm = i.data.norm()
                    values.append(f"{input_norm}")
                    logger(f"debug_info_forward/{name}_rank{rank}_input{n}", input_norm)
            if isinstance(outputs, tuple):
                for n, i in enumerate(outputs):
                    if isinstance(i, torch.Tensor) and i.dtype in [torch.float, torch.half, torch.bfloat16]:
                        if not header:
                            headers.append("output")
                        output_norm = i.data.norm()
                        values.append(f"{output_norm}")
                        logger(f"debug_info_forward/{name}_rank{rank}_output{n}", output_norm)
            else:
                headers.append("output")
                values.append(f"{outputs.data.norm()}")
            values.append(f"{trainer.global_step}")
            if dump_to_file:
                # Write the csv header once, then one row of norms per training step.
                if not header:
                    headers.append("step")
                    fp.write(",".join(headers) + "\n")
                    header = True
                fp.write(",".join(values) + "\n")
                fp.flush()

    return forward_hook
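

# Usage sketch (assumption, not part of the original module): the factory can also be
# used on its own to watch a single submodule. `my_model`, `my_trainer`, and
# `my_logger` are hypothetical stand-ins for a model, a PTL trainer, and its log
# function; the returned handle detaches the hook when it is no longer needed.
#
#     hook = get_forward_hook("encoder.layer0", my_trainer, rank=0, logger=my_logger)
#     handle = my_model.encoder.layer0.register_forward_hook(hook)
#     ...  # run some training steps
#     handle.remove()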


def get_backward_hook(name, trainer, rank, logger, dump_to_file=False):
    """
    A backward hook to dump all the module input and output grad norms. The hook will be called every time the
    gradients with respect to module inputs are computed. Only float type input/output grad tensor norms are
    computed.

    For more details about the backward hook, check:
    https://pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_full_backward_hook.html

    Parameters
    ----------
    name : str
        Module name used to tag the logged grad norms.
    trainer : pytorch_lightning.Trainer
        PTL trainer.
    rank : int
        Worker rank.
    logger : callable
        PTL log function.
    dump_to_file : bool, optional
        Whether to dump the csv file to disk, by default False.

    Returns
    -------
    backward_hook : callable
        Hook function to pass to torch.nn.Module.register_full_backward_hook.
    """
    # Initialize unconditionally so the hook closure can always resolve `fp` and
    # `header`, even when dump_to_file is False and nothing is written to disk.
    fp = None
    header = False
    if dump_to_file:
        os.makedirs("debug_info", exist_ok=True)
        fp = open(f"debug_info/backward_{name}_rank{rank}.txt", "w")

    def backward_hook(module, inputs, outputs):
        """Backward hook to dump all the module input and output grad norms. The hook will be called every time
        the gradients with respect to module inputs are computed. Only float type grad tensor norms are computed."""
        # With register_full_backward_hook, `inputs` and `outputs` are the grad_input
        # and grad_output tuples, so the norms below are gradient norms.
        nonlocal header
        nonlocal fp
        if trainer.training:
            values = []
            headers = []
            for n, i in enumerate(inputs):
                if isinstance(i, torch.Tensor) and i.dtype in [torch.float, torch.half, torch.bfloat16]:
                    if not header:
                        headers.append("input")
                    input_norm = i.data.norm()
                    values.append(f"{input_norm}")
                    logger(f"debug_info_backward/{name}_rank{rank}_input{n}", input_norm)
            if isinstance(outputs, tuple):
                for n, i in enumerate(outputs):
                    if isinstance(i, torch.Tensor) and i.dtype in [torch.float, torch.half, torch.bfloat16]:
                        if not header:
                            headers.append("output")
                        output_norm = i.data.norm()
                        values.append(f"{output_norm}")
                        logger(f"debug_info_backward/{name}_rank{rank}_output{n}", output_norm)
            else:
                headers.append("output")
                values.append(f"{outputs.data.norm()}")
            values.append(f"{trainer.global_step}")
            if dump_to_file:
                # Write the csv header once, then one row of grad norms per step.
                if not header:
                    headers.append("step")
                    fp.write(",".join(headers) + "\n")
                    header = True
                fp.write(",".join(values) + "\n")
                fp.flush()

    return backward_hook
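

# Usage sketch (assumption, hypothetical names as above): mirrors the forward case,
# but the logged values are gradient norms, because full backward hooks receive the
# grad_input/grad_output tuples rather than activations.
#
#     bwd_hook = get_backward_hook("encoder.layer0", my_trainer, rank=0, logger=my_logger)
#     bwd_handle = my_model.encoder.layer0.register_full_backward_hook(bwd_hook)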


def get_tensor_hook(module, name, trainer, rank, logger, dump_to_file=False):
    """
    A tensor hook to dump all of the tensor weight norms and grad norms at the end of each backward step.

    For more details about the tensor hook, check:
    https://pytorch.org/docs/stable/generated/torch.Tensor.register_hook.html

    Parameters
    ----------
    module : torch.nn.Module
        Module that owns the parameter; used to look the tensor up by name.
    name : str
        Parameter (tensor) name, as produced by named_parameters().
    trainer : pytorch_lightning.Trainer
        PTL trainer.
    rank : int
        Worker rank.
    logger : callable
        PTL log function.
    dump_to_file : bool, optional
        Whether to dump the csv file to disk, by default False.

    Returns
    -------
    tensor_hook : callable
        Hook function to pass to torch.Tensor.register_hook.
    """
    # Initialize unconditionally so the hook closure can always resolve `fp` and
    # `header`, even when dump_to_file is False and nothing is written to disk.
    fp = None
    header = False
    if dump_to_file:
        os.makedirs("debug_info", exist_ok=True)
        fp = open(f"debug_info/tensor_{name}_rank{rank}.csv", "w")

    def tensor_hook(grad):
        """Tensor hook to dump all the tensor weight norms and grad norms at the end of each backward step."""
        nonlocal header
        nonlocal fp
        values = []
        headers = []
        # get_parameter resolves the dotted names produced by named_parameters().
        weight = module.get_parameter(name)
        weight_norm = weight.data.norm()
        grad_norm = grad.data.norm()
        logger(f"debug_info_tensors/{name}_rank{rank}_grad_norm", grad_norm)
        logger(f"debug_info_tensors/{name}_rank{rank}_weight_norm", weight_norm)
        values.append(f"{weight_norm}")
        values.append(f"{grad_norm}")
        values.append(f"{trainer.global_step}")
        if dump_to_file:
            # Write the csv header once, then one row per backward step.
            if not header:
                headers.append("weight")
                headers.append("grad")
                headers.append("step")
                fp.write(",".join(headers) + "\n")
                header = True
            fp.write(",".join(values) + "\n")
            fp.flush()
        return grad

    return tensor_hook
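

# Usage sketch (assumption, hypothetical names): watch a single parameter by hand.
# get_parameter accepts the dotted names produced by named_parameters(), e.g.
# "encoder.layer0.weight".
#
#     p_name = "encoder.layer0.weight"
#     my_model.get_parameter(p_name).register_hook(
#         get_tensor_hook(my_model, p_name, my_trainer, rank=0, logger=my_logger)
#     )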


def register_debug_hooks(module, trainer, logger, dump_to_file=False):
    """
    Register debug hooks on a module. They can:
        1. track the module forward step input/output norms,
        2. track the module backward step input/output grad norms,
        3. track the parameter weight norms and grad norms.
    """
    # Default to rank 0 when torch.distributed is not initialized.
    rank = 0
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    for name, tensor in module.named_parameters():
        if name != "":
            tensor.register_hook(get_tensor_hook(module, name, trainer, rank, logger, dump_to_file))
    for name, layer in module.named_modules():
        if name != "":
            layer.register_forward_hook(get_forward_hook(name, trainer, rank, logger, dump_to_file))
            layer.register_full_backward_hook(get_backward_hook(name, trainer, rank, logger, dump_to_file))
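

if __name__ == "__main__":
    # Minimal smoke test (assumption: not part of the original module). A stub object
    # stands in for the PTL trainer, since the hooks only read `.training` and
    # `.global_step`, and a print function stands in for the PTL log function.
    import types

    stub_trainer = types.SimpleNamespace(training=True, global_step=0)

    def print_logger(tag, value):
        """Stand-in logger: echo each tracked norm instead of logging to PTL."""
        print(f"{tag}: {float(value):.6f}")

    model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU(), torch.nn.Linear(4, 1))
    register_debug_hooks(model, stub_trainer, print_logger)

    # One forward/backward pass triggers the forward, backward, and tensor hooks.
    loss = model(torch.randn(2, 8)).sum()
    loss.backward()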